In [22]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error,mean_squared_error
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

import warnings

In [2]:
# Load the student-performance records from the project-relative CSV file.
df = pd.read_csv(filepath_or_buffer="data/StudentsPerformance.csv")

In [4]:
# Target variable: the per-row mean of the math, reading and writing scores.
df["Average_score"] = (
    df["math score"]
    .add(df["reading score"])
    .add(df["writing score"])
    .div(3)
)

In [19]:
# Build the feature matrix (x) and the target (y).
# The three exam scores are removed from the features along with the target:
# Average_score is computed directly from them, so keeping them in x would
# hand the models the target's own components (target leakage).
x = df.drop(
    columns=["Average_score", "math score", "reading score", "writing score"]
)
y = df["Average_score"]

# Split the feature columns by dtype for the preprocessing pipelines.
# select_dtypes is used instead of comparing dtype against the literal strings
# "int64"/"object" so that every column lands in exactly one list; a column
# in neither list would not be passed to either preprocessing branch.
num_cols = x.select_dtypes(include="number").columns.tolist()
cat_cols = x.select_dtypes(exclude="number").columns.tolist()

In [52]:
# Preprocessing: numeric columns are mean-imputed then min-max scaled;
# categorical columns are mode-imputed then one-hot encoded. A
# ColumnTransformer routes each column list to its branch.
# NOTE(review): the transformer is fitted on the full x, before the train/test
# split in the following cell, so the imputation/scaling statistics are learned
# from rows that later end up in the test set — consider fitting on the
# training rows only.
num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", MinMaxScaler()),
])

cat_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": categories unseen at fit time do not raise.
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

feat_transformer = ColumnTransformer([
    ("num_transform", num_pipeline, num_cols),
    ("cat_transform", cat_pipeline, cat_cols),
])

# Fit the preprocessing on x and produce the encoded feature matrix.
X_transformed = feat_transformer.fit_transform(x)

In [53]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)

In [55]:
# Sanity check: display the dimensions of the training feature matrix.
x_train.shape

(800, 17)

In [56]:
# Baseline model: fit a linear regression on the training split.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [57]:
# Predict on both splits; only the held-out predictions are scored below.
y_train_predicted = model.predict(x_train)
y_test_predicted = model.predict(x_test)

# Held-out error metrics for the fitted model.
mae = mean_absolute_error(y_true=y_test, y_pred=y_test_predicted)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_predicted)
rmse = np.sqrt(mse)  # root of the MSE computed on the line above
r2_square = r2_score(y_true=y_test, y_pred=y_test_predicted)

In [58]:
# Report the test-set metrics in the order: MAE, MSE, RMSE, R^2.
print(mae, mse, rmse, r2_square)

10.490182720909907 179.60236164577393 13.40158056520849 0.16217176755435947


In [64]:
# Train a decision-tree regressor on the same train/test split.
# random_state is pinned (same seed as the train/test split) so that
# re-running this cell reproduces the same tree and therefore the same metrics.
model = DecisionTreeRegressor(random_state=42)
model.fit(x_train, y_train)

# Predict on both splits; only the held-out predictions are scored below.
y_train_predicted = model.predict(x_train)
y_test_predicted = model.predict(x_test)

# Held-out error metrics; RMSE reuses the MSE instead of recomputing it.
mae = mean_absolute_error(y_test, y_test_predicted)
mse = mean_squared_error(y_test, y_test_predicted)
rmse = np.sqrt(mse)
r2_square = r2_score(y_test, y_test_predicted)

# Report the test-set metrics in the order: MAE, MSE, RMSE, R^2.
print(mae, mse, rmse, r2_square)

11.884444684944683 232.652784158266 15.252959849100305 -0.08530349567073015


: 

In [63]:
# Eyeball the first five test predictions against the first five true targets.
print(
    y_test_predicted[:5],
    "\n=======================\n",
    y_test[:5],
)

[66.76803  70.00797  62.04433  56.44541  80.139595] 
 521    87.000000
737    64.000000
740    75.000000
660    74.666667
411    81.666667
Name: Average_score, dtype: float64


In [50]:
# Feature matrix without the target and without the three exam scores the
# target is computed from; y is the averaged score.
# NOTE(review): this cell sits after the model cells, yet its execution count
# (50) precedes the pipeline cell (52). On a top-to-bottom run it executes
# only after the models above have been trained, so they would not see this x.
x = df.drop(
    columns=["Average_score", "math score", "reading score", "writing score"]
)
y = df["Average_score"]

# Split the feature columns by dtype. select_dtypes replaces the comparison
# against the literal strings "int64"/"object" so that every column lands in
# exactly one of the two lists.
num_cols = x.select_dtypes(include="number").columns.tolist()
cat_cols = x.select_dtypes(exclude="number").columns.tolist()

[]