In [30]:
import pandas as pd
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , OneHotEncoder , OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [6]:
df = pd.read_csv('StudentScore.xls')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
profile = ProfileReport(df , title = "Pandas Profiling Report", explorative=True)
profile.to_file("output.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:00<00:00, 245.73it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [8]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [10]:
for i in df.columns:
    count = df[i].value_counts()
    plt.figure(figsize=(10, 5))
    count.plot(kind='bar')
    plt.title(f'Value Counts for {i}')
    plt.xlabel(i)
    plt.ylabel('Count')
    plt.show()

  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()


Spliting Data

In [11]:
target = "writing score"
x = df.drop(target , axis = 1)
y = df[target]
print(x.shape, y.shape)

(1000, 7) (1000,)


In [12]:
x_train , x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

Transform

In [13]:
#transform numerical data

num_col = ['math score', 'reading score']

num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [14]:
#transform Ordinal data

education_values = ["some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"]
lunch_values = x_train['lunch'].unique()
gender_values = x_train['gender'].unique()
test_values = x_train['test preparation course'].unique()

ord_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OrdinalEncoder(categories = [education_values, 
                                            lunch_values,
                                            gender_values,
                                            test_values]))
])

In [15]:
#transform nominal data

nom_col = ["race/ethnicity"]

nom_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', OneHotEncoder(sparse_output=False))
])

Apply ColumnsTransformer

In [17]:
preprocessor = ColumnTransformer(transformers=[
        ('num', num_transform, num_col),
        ('ord', ord_transform, ['parental level of education', 'lunch', 'gender' , 'test preparation course']),
        ('nom', nom_transform, nom_col)
])

In [18]:
preprocessor

Modeling

In [20]:
reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [21]:
reg.fit(x_train, y_train)

In [25]:
y_pred = reg.predict(x_test)
y_pred

array([88.67289105, 66.50404492, 73.2173696 , 70.90618711, 79.73674645,
       73.7679599 , 68.57747587, 62.50836852, 72.78169688, 51.17543341,
       41.84060229, 21.86423508, 79.0260469 , 63.1143721 , 82.08157849,
       78.80232725, 49.57508298, 47.9645953 , 57.23114693, 65.84746888,
       71.53171194, 55.75544106, 69.30177764, 49.37376635, 77.58272974,
       73.83057686, 73.93932607, 56.18895737, 48.74782845, 58.55020878,
       56.75959134, 64.95659712, 59.0909488 , 66.00751226, 72.25366067,
       52.1227874 , 74.55702568, 77.12804116, 79.58686424, 14.70704909,
       76.55230208, 63.08422813, 65.17127171, 61.83746112, 84.21799935,
       64.60518271, 67.08390706, 32.03731704, 86.43106722, 83.23278292,
       72.08462324, 75.70810196, 77.59430284, 58.52505634, 71.70554635,
       75.17197117, 78.81149468, 52.28684105, 80.39276703, 90.67564071,
       41.45468775, 82.67788287, 79.88717053, 61.13173774, 88.72416429,
       77.46225191, 68.77701841, 50.19052195, 68.0070307 , 93.34

In [28]:
for i , x in zip(y_pred , y_test):
    print("Predicted: {} - Actual: {}".format(i, x))

Predicted: 88.67289105107454 - Actual: 84
Predicted: 66.50404491711782 - Actual: 73
Predicted: 73.21736959975686 - Actual: 72
Predicted: 70.9061871053156 - Actual: 73
Predicted: 79.73674644539479 - Actual: 78
Predicted: 73.76795989709788 - Actual: 78
Predicted: 68.57747587046838 - Actual: 63
Predicted: 62.508368521932724 - Actual: 62
Predicted: 72.78169688316078 - Actual: 72
Predicted: 51.17543340882495 - Actual: 41
Predicted: 41.84060228989462 - Actual: 49
Predicted: 21.86423507932797 - Actual: 22
Predicted: 79.02604690031274 - Actual: 81
Predicted: 63.11437209786499 - Actual: 61
Predicted: 82.08157849466899 - Actual: 85
Predicted: 78.80232725357843 - Actual: 82
Predicted: 49.575082984729335 - Actual: 42
Predicted: 47.96459530303818 - Actual: 44
Predicted: 57.23114693308114 - Actual: 54
Predicted: 65.84746887782158 - Actual: 63
Predicted: 71.53171194494661 - Actual: 69
Predicted: 55.755441059963694 - Actual: 54
Predicted: 69.30177764338508 - Actual: 70
Predicted: 49.37376634856032 - A

In [31]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)

Mean Absolute Error: 3.174564343542829
Mean Squared Error: 15.393227002335799
