In [None]:
pip install category_encoders # type: ignore

In [None]:
pip install shap # type: ignore

In [None]:
pip install xgboost # type: ignore

In [None]:
import pandas as pd
import numpy as np
import math as m
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
import shap
import category_encoders as ce
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [None]:
df = pd.read_csv("Expanded_data_with_more_features.csv")
df.head()

In [None]:
# drop_duplicates() returns a new frame and leaves `df` untouched, so the result
# has to be bound back to `df` for the de-duplication to take effect.
# NOTE(review): the "Unnamed: 0" column is still present at this point; if it holds a
# unique row id, no two rows compare equal and nothing is removed — confirm.
df = df.drop_duplicates()
df

In [None]:
df.dtypes

In [None]:
df.isna().sum()

In [None]:
# Drop every row that contains at least one NaN, as discussed when choosing the dataset.
# dropna() returns a new frame, which is rebound to `df` here.
df = df.dropna()
df

In [None]:
df.isnull().sum()

In [None]:
# Average of the three subject scores, rounded to zero decimals
df["avg_total_score"] = (
    df["MathScore"]
    .add(df["ReadingScore"])
    .add(df["WritingScore"])
    .div(3)
    .round()
)
df

In [None]:
# Remove the "Unnamed: 0" column (presumably a leftover CSV row index — confirm).
# Rebinding instead of inplace=True, plus errors="ignore", keeps this cell re-runnable
# after the column is already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Map each weekly-study-hours bracket to one representative number
mapping = {'< 5': 5, '5 - 10': 10, '> 10': 15}

# Replace values in the specified column
df["WklyStudyHours"] = df["WklyStudyHours"].replace(mapping)

# Check the result
print(df["WklyStudyHours"].unique())

# astype() returns a new Series; without assigning it back the dtype never changes.
# NOTE(review): the integer cast assumes no NaNs remain in these columns (rows with
# NaNs are dropped in an earlier cell) — confirm.
df["WklyStudyHours"] = df["WklyStudyHours"].astype("int")
df["NrSiblings"] = df["NrSiblings"].astype("int")
df

In [None]:
df.dtypes

In [None]:
# clean the outliers
def detect_outliers_quantile_low(data = df, lower_quantile=0.1, upper_quantile=1):
    """Return the entries of ``data`` lying strictly outside its quantile band.

    data: object exposing ``.quantile`` and elementwise comparison; the notebook calls
        this with the Series ``df["avg_total_score"]``.
    lower_quantile, upper_quantile: quantile levels that bound the band. With the
        defaults (0.1 and 1) the upper bound is the maximum, so only the low tail
        can be selected.
    Returns ``data`` indexed by the mask "below the lower bound or above the upper bound".
    """
    low_cutoff, high_cutoff = (data.quantile(q) for q in (lower_quantile, upper_quantile))
    outside_band = data.lt(low_cutoff) | data.gt(high_cutoff)
    return data[outside_band]

outliers_total_score = detect_outliers_quantile_low(df["avg_total_score"])
# Rows whose score equals one of the flagged outlier values
low_outlier_mask = df["avg_total_score"].isin(outliers_total_score)
# cleaned df: the logical operator ~ flips the values of the boolean mask
cleaned_df_low = df.loc[~low_outlier_mask]
# outlier df: the complementary rows
outliers_df_low = df.loc[low_outlier_mask]

In [None]:
cleaned_df_low

In [None]:
outliers_df_low

In [None]:
def detect_outliers_quantile_high(data = df, lower_quantile=0, upper_quantile=0.95):
    """Return the entries of ``data`` lying strictly outside its quantile band.

    data: object exposing ``.quantile`` and elementwise comparison; the notebook calls
        this with the Series ``df["avg_total_score"]``.
    lower_quantile, upper_quantile: quantile levels that bound the band. With the
        defaults (0 and 0.95) the lower bound is the minimum, so only the high tail
        can be selected.
    Returns ``data`` indexed by the mask "below the lower bound or above the upper bound".
    """
    floor_value = data.quantile(lower_quantile)
    ceiling_value = data.quantile(upper_quantile)
    is_outside = data.lt(floor_value) | data.gt(ceiling_value)
    return data[is_outside]


outliers_total_score = detect_outliers_quantile_high(df["avg_total_score"])
# Rows whose score equals one of the flagged outlier values
high_outlier_mask = df["avg_total_score"].isin(outliers_total_score)
cleaned_df_high = df.loc[~high_outlier_mask]
outliers_df_high = df.loc[high_outlier_mask]

In [None]:
cleaned_df_high

In [None]:
outliers_df_high

In [None]:
def detect_outliers_quantile(data = df, lower_quantile=0.05, upper_quantile=0.95):
    """Return the entries of ``data`` lying strictly outside its quantile band.

    data: object exposing ``.quantile`` and elementwise comparison; the notebook calls
        this with the Series ``df["avg_total_score"]``.
    lower_quantile, upper_quantile: quantile levels that bound the band (defaults
        0.05 and 0.95, i.e. both tails are selected).
    Returns ``data`` indexed by the mask "below the lower bound or above the upper bound".
    """
    band = {"low": data.quantile(lower_quantile), "high": data.quantile(upper_quantile)}
    too_small = data.lt(band["low"])
    too_large = data.gt(band["high"])
    return data[too_small | too_large]

outliers_total_score = detect_outliers_quantile(df["avg_total_score"])
# Rows whose score equals one of the flagged outlier values (either tail)
outlier_mask = df["avg_total_score"].isin(outliers_total_score)
cleaned_df = df.loc[~outlier_mask]
outliers_df = df.loc[outlier_mask]

In [None]:
cleaned_df

In [None]:
outliers_df_high

In [None]:
outliers_df_low

In [None]:
outliers_df_low["avg_total_score"].unique()

In [None]:
# Save the cleaned frame and the two outlier frames as CSV (row index not written).
# NOTE(review): the file names carry no ".csv" extension — confirm this is intended.

cleaned_df.to_csv("cleaned_df", index= False)
outliers_df_high.to_csv("outliers_df_high", index= False)
outliers_df_low.to_csv("outliers_df_low", index= False)

In [None]:
cleaned_df.dtypes

In [None]:

# Separate features and target (the three raw scores are dropped because the target is built from them)
X = cleaned_df.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore"], axis=1)  # Features
y = cleaned_df['avg_total_score']  # Target

# One-hot encode categorical columns using pandas get_dummies
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.25, random_state=42)

# Initialize Random Forest Classifier.
# The forest is randomized; seeding it (same seed as the split) makes the importances
# reproducible across "Restart & Run All".
# NOTE(review): every distinct rounded score is treated as its own class here;
# RandomForestRegressor (already imported) may fit a numeric target better — confirm intent.
rf = RandomForestClassifier(random_state=42)

# Fit the model
rf.fit(X_train, y_train)

# Get feature importances (one value per column of X_train)
feature_importances = rf.feature_importances_

# Create DataFrame to hold feature names and importances
importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})

# Sort the DataFrame by importance values
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Print the sorted feature importances
print(importance_df)

In [None]:
# Built-in importances of the fitted forest `rf`, smallest first so the largest bar ends up on top
global_importances = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=True)
ax = global_importances.plot.barh(color='green')
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Global Feature Importance - Built-in Method")

In [None]:
X = cleaned_df.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore"], axis=1)  # Features
y = cleaned_df['avg_total_score']  # Target

# One-hot encode categorical columns using pandas get_dummies
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.25, random_state=42)

# Seeded so the fitted forest, and therefore the permutation scores, are reproducible
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Permutation importance measured on the held-out split
result = permutation_importance(rf, X_test, y_test, n_repeats=10, random_state=42)

perm_importances = result.importances_mean
perm_std = result.importances_std
feature_names = X_test.columns

# importances_mean / importances_std are in the column order of X_test, so the rows must be
# labelled with the names in that same order. Labelling them with argsort-reordered names
# while leaving the values unsorted pairs each feature with another feature's score.
# sort_values then orders names and values together.
pd.DataFrame({'Importance': perm_importances, 'Std': perm_std}, index=feature_names).sort_values('Importance', ascending=True)


In [None]:
# Random forest on the low-score outlier rows only
X = outliers_df_low.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore"], axis=1)  # Features
y = outliers_df_low['avg_total_score']  # Target

# One-hot encode categorical columns using pandas get_dummies
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.25, random_state=42)

# Seeded: an unseeded forest gives different scores and rankings on every run
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {model.score(X_test, y_test)}")

importances = model.feature_importances_

# Ascending order, so the most important feature is drawn as the top bar
indices = np.argsort(importances)

fig, ax = plt.subplots()
ax.barh(range(len(importances)), importances[indices])
ax.set_yticks(range(len(importances)))
ax.set_yticklabels(np.array(X_train.columns)[indices])
ax.set_xlabel("Importance")
_ = ax.set_title("Random forest feature importance - low-score outliers")



In [None]:
# Random forest on the high-score outlier rows only
X = outliers_df_high.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore"], axis=1)  # Features
y = outliers_df_high['avg_total_score']  # Target

# One-hot encode categorical columns using pandas get_dummies
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.25, random_state=42)

# Seeded: an unseeded forest gives different scores and rankings on every run
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {model.score(X_test, y_test)}")

importances = model.feature_importances_

# Ascending order, so the most important feature is drawn as the top bar
indices = np.argsort(importances)

fig, ax = plt.subplots()
ax.barh(range(len(importances)), importances[indices])
ax.set_yticks(range(len(importances)))
ax.set_yticklabels(np.array(X_train.columns)[indices])
ax.set_xlabel("Importance")
_ = ax.set_title("Random forest feature importance - high-score outliers")

In [None]:
# Random forest on the cleaned (non-outlier) rows
X = cleaned_df.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore"], axis=1)  # Features
y = cleaned_df['avg_total_score']  # Target

# One-hot encode categorical columns using pandas get_dummies
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.25, random_state=42)

# Seeded: an unseeded forest gives different scores and rankings on every run
model = RandomForestClassifier(random_state=42)

model.fit(X_train, y_train)

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {model.score(X_test, y_test)}")

importances = model.feature_importances_

# Ascending order, so the most important feature is drawn as the top bar
indices = np.argsort(importances)

fig, ax = plt.subplots()
ax.barh(range(len(importances)), importances[indices])
ax.set_yticks(range(len(importances)))
ax.set_yticklabels(np.array(X_train.columns)[indices])
ax.set_xlabel("Importance")
_ = ax.set_title("Random forest feature importance - cleaned data")

In [None]:
# Cleaned frame without the three raw subject scores (the averaged score is kept)
df_cleaned_no_score = cleaned_df.drop(columns=["ReadingScore","WritingScore", "MathScore"])
df_cleaned_no_score

In [None]:
# Sanity check: distinct weekly-study-hour values present in the cleaned frame
unique_values = pd.unique(df_cleaned_no_score["WklyStudyHours"])
print(unique_values)

In [None]:
# Random forest on the cleaned frame whose raw subject scores were already removed
X = df_cleaned_no_score.drop('avg_total_score', axis=1)  # Features
y = df_cleaned_no_score['avg_total_score']  # Target

# One-hot encode categorical columns using pandas get_dummies
X_encoded = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=.25, random_state=42)

# Seeded: an unseeded forest gives different scores and rankings on every run
model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {model.score(X_test, y_test)}")

importances = model.feature_importances_

# Ascending order, so the most important feature is drawn as the top bar
indices = np.argsort(importances)

fig, ax = plt.subplots()
ax.barh(range(len(importances)), importances[indices])
ax.set_yticks(range(len(importances)))
ax.set_yticklabels(np.array(X_train.columns)[indices])
ax.set_xlabel("Importance")
_ = ax.set_title("Random forest feature importance - cleaned data (scores removed)")

In [None]:
X

In [None]:
# Stack the high-score and the low-score outlier rows into one frame (original row labels are kept)
outliers_ova = pd.concat([outliers_df_high,  outliers_df_low])

In [None]:
outliers_ova

In [None]:
# Remove the three raw subject scores from the combined outlier frame
outliers_ova = outliers_ova.drop(columns=["ReadingScore","WritingScore", "MathScore"])

In [None]:
def classify_student(score):
    """Binary label for a score: 0 when the score is below 50, otherwise 1."""
    return 0 if score < 50 else 1

In [None]:
# Label every outlier row with classify_student applied to its averaged score
outliers_ova["classification"] = outliers_ova["avg_total_score"].map(classify_student)
outliers_ova

In [None]:
# Same labelling on the full frame
df["classification"] = df["avg_total_score"].map(classify_student)
df

In [None]:
outliers_ova

In [None]:


# Binary target and candidate feature frame for the logistic model
y = outliers_ova["classification"]
X = outliers_ova.drop(columns='classification')

# Columns handed to the model; the ColumnTransformer below is given only these two lists
numerical_features = ['NrSiblings','WklyStudyHours']
categorical_features = [
    'Gender', 'EthnicGroup', 'ParentEduc', "LunchType", "TestPrep",
    "ParentMaritalStatus", "PracticeSport", "IsFirstChild", "TransportMeans",
]

# Numeric columns: impute, then standardize
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler())])

# Categorical columns: one-hot encode; categories unseen during fit are ignored at predict time
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Column transformer to apply the transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing and class-weighted logistic regression chained in one estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(class_weight="balanced")),
])

# Training the model
model.fit(X_train, y_train)

# Predicting on both splits, then summarizing per-class metrics
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)
reporttest = classification_report(y_test, y_pred_test)
reporttrain = classification_report(y_train, y_pred_train)

print("test", reporttest)
print("train", reporttrain)
    


In [None]:
def perform_logistic_regression(data, target_col, drop_col,  test_size=0.2, random_state=42):
    """Fit a class-balanced logistic regression and print its test and train reports.

    data: DataFrame holding the features, the target and the column to discard.
    target_col: name of the column used as the target.
    drop_col: name of a single column removed before modelling.
    test_size, random_state: forwarded to ``train_test_split``.

    All remaining columns are one-hot encoded with ``pd.get_dummies`` before the split.
    Nothing is returned; the two classification reports are printed.
    """
    frame = data.drop(columns=[drop_col])

    # Target first, then the one-hot encoded remainder as features
    y = frame[target_col]
    X = pd.get_dummies(frame.drop(columns=[target_col]))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # fit() returns the estimator itself, so construction and training can be chained
    model = LogisticRegression(class_weight="balanced").fit(X_train, y_train)

    reporttest = classification_report(y_test, model.predict(X_test))
    reporttrain = classification_report(y_train, model.predict(X_train))

    print("test", reporttest)
    print("train", reporttrain)

    

In [None]:
perform_logistic_regression(outliers_ova, target_col="classification", drop_col="avg_total_score", test_size= 0.2, random_state= 42)


In [None]:

def confidence(confidence_level, data=None, column="classification"):
    """Confidence interval for the share of 1-labels in a 0/1 column.

    confidence_level: two-sided level, e.g. 0.95.
    data: frame to evaluate; when omitted the notebook-level ``outliers_ova`` is used,
        which is what the original one-argument call did.
    column: name of the 0/1 column whose mean is the estimated proportion.

    Returns ``[lower, upper]``: proportion -/+ t-quantile (n-1 degrees of freedom)
    times sqrt(p * (1 - p) / n).
    Raises ValueError when ``data`` has no rows (the proportion is undefined).
    """
    if data is None:
        data = outliers_ova
    n = len(data)
    if n == 0:
        raise ValueError("cannot compute a confidence interval from an empty frame")
    p = data[column].sum()/n
    # Upper-tail quantile for a two-sided interval: level + (1 - level) / 2
    t = stats.t.ppf(confidence_level + (1- confidence_level)/2, n-1)
    error = t * m.sqrt(p * (1-p)/n)
    CI = [p - error, p + error]
    return CI


In [None]:
confidence(0.95)

In [None]:


# Features: every column except the target, one-hot encoded
X = pd.get_dummies(df_cleaned_no_score.drop(columns=['avg_total_score']))

# Target: each distinct score value is turned into an integer class label
le = LabelEncoder()
y = le.fit_transform(df_cleaned_no_score['avg_total_score'])

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# fit model on the training split
model = XGBClassifier()
model.fit(X_train, y_train)

plot_importance(model)

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {model.score(X_test, y_test)}")

In [None]:

# 'classification' is computed from avg_total_score in an earlier cell; leaving it in the
# features leaks the target, so it is removed together with the target itself.
# errors="ignore" keeps the cell runnable if that column has not been added yet.
X = outliers_ova.drop(columns=['avg_total_score']).drop(columns=['classification'], errors="ignore")
y = outliers_ova['avg_total_score']

X = pd.get_dummies(X)

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the training labels as consecutive integers for XGBoost
le = LabelEncoder()
y_train = le.fit_transform(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

plot_importance(model)

# y_test still holds raw scores while the model predicts encoded class indices.
# Decode the predictions before comparing, otherwise the test score compares two
# different label spaces. Scores never seen in training simply count as misses.
y_pred_test = le.inverse_transform(model.predict(X_test))

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {np.mean(y_pred_test == y_test.to_numpy())}")



In [None]:

X = outliers_df_high.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore", "NrSiblings"], axis=1) # drop NrSiblings because I can not explain if more or less siblings are influential yet
y = outliers_df_high['avg_total_score']

X = pd.get_dummies(X)

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the training labels as consecutive integers for XGBoost
le = LabelEncoder()
y_train = le.fit_transform(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

plot_importance(model)

# y_test still holds raw scores while the model predicts encoded class indices.
# Decode the predictions before comparing, otherwise the test score compares two
# different label spaces. Scores never seen in training simply count as misses.
y_pred_test = le.inverse_transform(model.predict(X_test))

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {np.mean(y_pred_test == y_test.to_numpy())}")



In [None]:

X = outliers_df_low.drop(['avg_total_score', "MathScore", "ReadingScore", "WritingScore", "NrSiblings"], axis=1) # drop NrSiblings because I can not explain if more or less siblings are influential yet
y = outliers_df_low['avg_total_score']

X = pd.get_dummies(X)

# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode the training labels as consecutive integers for XGBoost
le = LabelEncoder()
y_train = le.fit_transform(y_train)
model = XGBClassifier()
model.fit(X_train, y_train)

plot_importance(model)

# y_test still holds raw scores while the model predicts encoded class indices.
# Decode the predictions before comparing, otherwise the test score compares two
# different label spaces. Scores never seen in training simply count as misses.
y_pred_test = le.inverse_transform(model.predict(X_test))

print(f"model score on training data: {model.score(X_train, y_train)}")
print(f"model score on testing data: {np.mean(y_pred_test == y_test.to_numpy())}")