**Import Packages & Data**

In [14]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [8]:
#Import CSV File of Cleaned Data: Test Scores
df = pd.read_csv('joined_df_cleaned.csv')


**Create Dummy Features**

In [9]:

# Identify categorical columns (replace with actual columns from your DataFrame)
categorical_cols = ['Jurisdiction']  # Add other categorical columns as needed

# Create dummy variables
dummy_df = pd.get_dummies(df[categorical_cols])

# Concatenate dummy variables with original DataFrame
df = pd.concat([df, dummy_df], axis=1)

# Drop the original categorical columns if needed
df.drop(categorical_cols, axis=1, inplace=True)

# Display the updated DataFrame
print(df.head())

   Average Math Score  ELL_x  White_x  Black_x  Hispanic_x  Low SES_x  Male_x  \
0               236.0  216.0    246.0    217.0       224.0      223.0   239.0   
1               230.0  216.0    240.0    213.0       222.0      218.0   233.0   
2               226.0  204.0    242.0      NaN       228.0      213.0   224.0   
3               232.0  197.0    247.0    215.0       221.0      218.0   235.0   
4               228.0  205.0    236.0    207.0       219.0      221.0   230.0   

   Female_x  ESE_x  Average Reading Score  ELL_y  White_y  Black_y  \
0     233.0  212.0                    217  190.0      227    199.0   
1     228.0  211.0                    213  196.0      224    197.0   
2     228.0  201.0                    204  187.0      220      NaN   
3     229.0  206.0                    215  164.0      229    205.0   
4     227.0  199.0                    212  174.0      221    188.0   

   Hispanic_y  Low SES_y  Male_y  Female_y  ESE_y  
0       205.0        203     214       2

**Standardize the magnitude of numeric features using a scaler**

In [12]:
# Identify numeric columns (assuming all except categorical columns)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the numeric data
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Display the updated DataFrame with standardized values
print(df.head())

   Average Math Score     ELL_x   White_x   Black_x  Hispanic_x  Low SES_x  \
0            0.279402  0.430210  0.509269  0.424220    0.149285   0.237934   
1           -0.924781  0.430210 -0.628222 -0.234763   -0.184002  -0.826508   
2           -1.727570 -1.040326 -0.249059       NaN    0.815859  -1.890950   
3           -0.523387 -1.898139  0.698851  0.094729   -0.350646  -0.826508   
4           -1.326176 -0.917781 -1.386550 -1.223238   -0.683933  -0.187843   

     Male_x  Female_x     ESE_x  Average Reading Score     ELL_y   White_y  \
0  0.369522  0.166577  0.411312               0.249935  0.342933  0.371738   
1 -0.739043 -0.895352  0.248700              -0.546733  1.005310 -0.159813   
2 -2.401890 -0.895352 -1.377418              -2.339237  0.011744 -0.868548   
3 -0.369522 -0.682966 -0.564359              -0.148399 -2.527368  0.726106   
4 -1.293325 -1.107737 -1.702641              -0.745900 -1.423406 -0.691364   

   Black_y  Hispanic_y  Low SES_y    Male_y  Female_y     ESE_

**Split into testing and training datasets**

In [18]:
# Define features (X) and target (y)
features = ['ELL_x', 'White_x', 'Black_x', 'Hispanic_x', 'Low SES_x', 'Male_x', 'Female_x', 'ESE_x',
            'ELL_y', 'White_y', 'Black_y', 'Hispanic_y', 'Low SES_y', 'Male_y', 'Female_y', 'ESE_y']
X = df[features]  # Features
y = df[['Average Math Score', 'Average Reading Score']]  # Target variables

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Review the dimensions of the datasets
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (35, 16), y_train shape: (35, 2)
X_test shape: (16, 16), y_test shape: (16, 2)
