In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDRegressor

In [102]:
# Read both semicolon-separated student CSV files into their own DataFrames.
# `sep` is the canonical name of the option the original spelled `delimiter`.
data1 = pd.read_csv('student-mat.csv', sep=';')
data2 = pd.read_csv('student-por.csv', sep=';')

In [103]:
# Preview the first rows of the frame read from student-mat.csv.
data1.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [104]:
# Preview the first rows of the frame read from student-por.csv.
data2.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13


In [105]:
# List data1's column labels so they can be compared with data2's before concatenating.
data1.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [106]:
# List data2's column labels so they can be compared with data1's before concatenating.
data2.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [107]:
# Stack the two datasets vertically (rows of data2 follow rows of data1).
# Both frames carry their own default 0..n-1 row index, so a plain concat
# would leave duplicate row labels in the result; ignore_index=True gives
# the combined frame a fresh, unique 0..N-1 index instead.
combined_data = pd.concat([data1, data2], axis=0, ignore_index=True)

# Verify the column names in the combined dataset
print(combined_data.columns)

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')


In [108]:
# Separate the target from the predictors: 'G3' is the value to predict,
# every remaining column is used as a feature.
y = combined_data['G3']
X = combined_data.drop(columns=['G3'])


In [109]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [110]:
# One-hot encode the categorical columns of the training features; the
# resulting column set defines the layout every model is fitted on.
X_train_encoded = pd.get_dummies(X_train)

# Encode the test features the same way, then force them onto the training
# layout: columns absent from the test encoding are added filled with 0,
# and columns that only appear in the test encoding are dropped.
X_test_encoded = pd.get_dummies(X_test).reindex(
    columns=X_train_encoded.columns,
    fill_value=0,
)


**Linear Regression**

In [111]:
# Ordinary least-squares baseline fitted on the one-hot encoded features.
linear_reg = LinearRegression().fit(X_train_encoded, y_train)
linear_reg_predictions = linear_reg.predict(X_test_encoded)
linear_reg_mse = mean_squared_error(y_true=y_test, y_pred=linear_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Linear Regression Predictions:", linear_reg_predictions)
print("Linear Regression MSE:", linear_reg_mse)

Linear Regression Predictions: [10.49462891  8.13037109 14.25244141 11.0012207  12.06787109 12.17724609
  5.48291016 10.43652344 14.45263672 13.27148438 15.3203125   6.85449219
  6.9934082  11.23632812  7.72314453 11.77880859  8.59863281 16.96679688
 11.20239258 11.94580078  8.89453125 16.59716797 11.17578125  0.52539062
 16.2890625  11.29858398 17.85009766 16.47192383 11.48974609  9.16748047
 10.30053711  9.30712891 18.95068359 10.82128906 15.40429688 13.84960938
 15.17724609 18.63256836  9.59423828 13.1784668  15.87890625 16.70849609
 11.49267578  8.47387695 14.60229492  9.74414062 10.92529297 16.91064453
 12.86523438 11.17578125 11.16967773 14.70507812 10.07763672 10.1875
  6.23046875 10.48779297 10.89282227 13.6105957  17.45141602  7.15917969
 10.28491211  8.10058594  7.58837891 12.71728516  6.52172852  9.29833984
  8.57250977  6.15136719  9.57910156 19.26708984 11.74121094 14.78173828
  8.31860352  8.60424805  8.28491211  9.3828125   6.18847656  7.91357422
  7.38183594 13.0480957 

**Polynomial Regression**

In [112]:
# Expand the encoded features with degree-2 polynomial terms. The expansion
# is fitted on the training set only and then applied unchanged to the test set.
poly_features = PolynomialFeatures(degree=2)
X_poly = poly_features.fit_transform(X_train_encoded)
X_test_poly = poly_features.transform(X_test_encoded)

# Plain least squares on the expanded feature matrix.
# NOTE(review): the MSE recorded for this model in the summary cell (~1192.9)
# is far above the plain linear fit (~3.2), which suggests the unregularised
# degree-2 expansion overfits -- confirm before relying on it.
poly_reg = LinearRegression().fit(X_poly, y_train)
poly_reg_predictions = poly_reg.predict(X_test_poly)
poly_reg_mse = mean_squared_error(y_true=y_test, y_pred=poly_reg_predictions)

print("Polynomial Regression Predictions:", poly_reg_predictions)
print("Polynomial Regression MSE:", poly_reg_mse)

Polynomial Regression Predictions: [-1.45804281e+00  1.52488913e+00 -6.25086342e+01  9.06101222e+01
  8.59636775e+00 -3.60199766e+01 -3.78863640e+01  1.18476004e+01
  4.85838625e+01  1.36864882e+01  2.07268672e+01  9.83284170e+00
 -5.11628184e+01  1.58031532e+01  6.87073588e+00 -2.08791689e+01
  5.26864907e+00  4.88150185e+01  1.12427694e+02  1.88852969e+01
  4.17240808e+01  1.71869425e+01  2.05748796e+01 -2.97495923e-01
  5.47805021e+00 -2.07251721e+01  1.11913186e+01  1.88914303e+01
  3.63025001e+00  9.47718209e+00  3.34746915e+01 -3.52213242e+01
  1.48273984e+01  9.74968263e+01  1.57769187e+01  2.42756664e+01
  9.69315836e+01  7.23786950e+00  9.12779667e-01 -2.75831130e+01
  1.17040566e+01  1.73109635e+01 -7.69289246e+00  6.09651006e+00
  1.29587235e+01 -4.20976640e+00  1.00174040e+01  1.29607984e+01
  6.01538046e+01  1.73426838e+01  1.60259181e+01  7.74730311e+00
  2.54693260e+01  7.01173284e+01  9.05836645e+00  9.95064693e+00
  4.48963906e+00  1.13432762e+01  1.80535350e+01  4.214

**Ridge Regression**

In [113]:
# Linear model with an L2 penalty (strength alpha=0.5) on the encoded features.
ridge_reg = Ridge(alpha=0.5).fit(X_train_encoded, y_train)
ridge_reg_predictions = ridge_reg.predict(X_test_encoded)
ridge_reg_mse = mean_squared_error(y_true=y_test, y_pred=ridge_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Ridge Regression Predictions:", ridge_reg_predictions)
print("Ridge Regression MSE:", ridge_reg_mse)

Ridge Regression Predictions: [10.49234764  8.13295045 14.25133845 11.00162501 12.06864439 12.1777405
  5.48485521 10.43981796 14.45344391 13.26995529 15.31900984  6.85404911
  6.99347984 11.23397215  7.722157   11.78152305  8.60066471 16.96603653
 11.1998913  11.94580564  8.89434598 16.59593525 11.17661778  0.52293948
 16.28755254 11.29940699 17.85019294 16.47290348 11.48727918  9.16719753
 10.29676959  9.31006659 18.94890845 10.82029979 15.40487858 13.85182856
 15.17490925 18.63334226  9.59333676 13.17977716 15.87719813 16.71279853
 11.4915074   8.47363747 14.60447895  9.74362787 10.9261497  16.90857716
 12.86575385 11.17563955 11.17018042 14.70753087 10.08030913 10.18159293
  6.23581008 10.48759089 10.89352321 13.60973848 17.45219625  7.16210731
 10.28407169  8.09732396  7.58504444 12.71513322  6.52438839  9.29886438
  8.57545626  6.15358697  9.57835523 19.268011   11.74034213 14.77710488
  8.32187685  8.59975288  8.28768901  9.38222539  6.18879693  7.91298109
  7.38203128 13.051971

**Lasso Regression**

In [114]:
# Linear model with an L1 penalty (strength alpha=0.5) on the encoded features.
lasso_reg = Lasso(alpha=0.5).fit(X_train_encoded, y_train)
lasso_reg_predictions = lasso_reg.predict(X_test_encoded)
lasso_reg_mse = mean_squared_error(y_true=y_test, y_pred=lasso_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Lasso Regression Predictions:", lasso_reg_predictions)
print("Lasso Regression MSE:", lasso_reg_mse)

Lasso Regression Predictions: [ 9.97815366  8.18025262 13.26429755 11.00026423 12.11027863 12.0223748
  6.03021019 11.09696896 14.34801446 13.17639373 15.44053226  6.76220657
  7.0964305  10.13635951  7.9338273  12.23338608  9.17575005 16.40983741
 11.29917933 11.93447098  9.14065165 16.40983741 10.95625971  0.26665866
 15.48443157 11.2024746  17.46725682 16.30433177 10.96506061  9.07024441
  9.94295004 10.01325206 18.31345452 10.9035595  15.25581849 14.05790026
 14.85139775 18.4806717   9.90774642 13.05328628 15.15031285 17.24733942
 10.97386152  8.7537275  15.04480721  9.90774642 10.99146333 16.21642795
 12.39169714 11.13217258 11.16727098 15.40532865 10.1275586   9.80224079
  6.88531401 10.99146333 10.95625971 13.17639373 17.24733942  7.72281603
 11.79376172  8.12723676  7.09632529 12.21578427  7.03482417  9.07034963
  9.90774642  6.89422014  9.08784622 18.61258005 11.2024746  14.22490701
  9.03514601  8.09213836  8.96473878  9.07024441  6.92051763  7.91633071
  7.00842146 13.449011

**Decision Tree Regression**

In [115]:
# Single regression tree on the encoded features.
# random_state is fixed (same seed as the train/test split) because the tree
# builder is randomised; without it the fitted tree, and therefore the MSE
# reported here, can change between runs of the notebook.
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train_encoded, y_train)
dt_reg_predictions = dt_reg.predict(X_test_encoded)
dt_reg_mse = mean_squared_error(y_test, dt_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Decision Tree Regression Predictions:", dt_reg_predictions)
print("Decision Tree Regression MSE:", dt_reg_mse)

Decision Tree Regression Predictions: [11.  7. 14. 11. 13. 11.  7. 11. 15. 14. 16.  8.  8. 10.  9. 12. 10. 15.
 11. 12. 11. 15. 14.  0. 14. 12. 18. 16. 11. 10. 11. 10. 18. 11. 15. 14.
 15. 19. 12. 12. 15. 15. 11.  7. 17. 12. 11. 16. 12. 12. 11. 16. 10. 10.
  8. 10. 11. 14. 18.  8. 11. 10.  7. 14.  8.  9. 10.  7.  9. 17. 11. 14.
  9.  9.  0. 10.  8. 10.  8. 14. 12. 11. 11.  0. 17. 15.  8.  9.  9. 11.
 15.  0. 10. 13. 11. 12. 11.  8. 12. 11. 15. 14. 12. 13. 11. 14. 19. 10.
 10. 12. 14.  9. 19. 15. 10. 14. 14. 11. 11.  0. 10. 10. 12. 10. 15. 11.
 10. 14. 13.  8.  8.  8. 13. 11.  0.  9.  8. 18. 16. 13. 11.  8.  9.  9.
 13. 14. 11. 14.  9.  5. 11. 15. 12.  8. 12. 12. 10. 13. 11. 10. 10. 10.
 12. 12. 12. 12. 10. 15. 11. 15. 12. 13. 11. 16. 14. 15. 17. 13. 11.  9.
  8.  0.  8.  9.  8. 10. 10. 15. 10.  8.  8. 11. 10. 12. 10. 12.  0. 10.
 12. 14.  0. 15.  0. 12. 11.  0. 14. 13. 10.]
Decision Tree Regression MSE: 4.899521531100478


**Random Forest Regression**

In [116]:
# Bagged ensemble of regression trees on the encoded features.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and tree construction -- and with them the reported MSE -- are
# reproducible on Restart & Run All.
rf_reg = RandomForestRegressor(random_state=42)
rf_reg.fit(X_train_encoded, y_train)
rf_reg_predictions = rf_reg.predict(X_test_encoded)
rf_reg_mse = mean_squared_error(y_test, rf_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Random Forest Regression Predictions:", rf_reg_predictions)
print("Random Forest Regression MSE:", rf_reg_mse)

Random Forest Regression Predictions: [10.6         7.44       13.08       11.12       12.95       12.08
  2.75       10.96       14.69       13.49       15.71833333  5.4
  7.79       11.01        7.76       12.29        9.55       15.66
 11.21       12.59       10.02       16.18       11.19        1.28
 15.465      11.31       17.67       16.51       10.96        9.62
  9.93       10.02       17.96       11.         15.25       14.6
 15.41       18.28       10.18       13.31       15.14       16.69
 10.93        8.48       16.04       10.54       10.93       16.25
 12.13       11.03       11.34       15.4325     10.24       10.06
  7.47       10.83       11.01       12.96       17.73        8.22
 11.9         5.07        7.68       12.33        8.          9.11
  6.66        6.68       10.2        18.15       11.24       14.32
  9.4         8.39        8.95        9.74        7.33        7.65
  7.28       13.69       13.09       12.66       10.6         9.2
 16.13       14.59        7

**Gradient Boosting Regression**

In [117]:
# Gradient-boosted regression trees on the encoded features.
# random_state is fixed (same seed as the train/test split) so the individual
# trees are built deterministically; this model's MSE decides the final
# "best model" comparison, so it must not vary from run to run.
gb_reg = GradientBoostingRegressor(random_state=42)
gb_reg.fit(X_train_encoded, y_train)
gb_reg_predictions = gb_reg.predict(X_test_encoded)
gb_reg_mse = mean_squared_error(y_test, gb_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Gradient Boosting Regression Predictions:", gb_reg_predictions)
print("Gradient Boosting Regression MSE:", gb_reg_mse)

Gradient Boosting Regression Predictions: [10.50870021  7.27435314 14.11357026 11.19306692 12.59864872 12.30090615
  2.98202669 10.88455636 14.62258798 13.2464778  15.92306159  4.28154516
  7.68016373 11.19206458  8.40011676 12.78160153  9.32457177 16.06573295
 11.29319526 12.36104183  9.92024263 16.24699759 10.81100885 -0.26115789
 15.66718553 11.40380525 17.1208449  16.2547722  11.29601782  9.0259701
 10.26316334  9.81098258 17.88751403 10.61525614 15.47573011 14.42689238
 14.97019118 17.94964325  9.95535722 13.23782759 15.41031898 17.25287526
 11.10001761  7.72259684 15.13782945 10.22254651 10.94284885 16.41694486
 12.86993319 11.27097186 11.48487218 15.15432572 10.34292173  9.57312278
  7.17909644 10.98083095 11.03078573 13.27267011 17.24755897  7.88215378
 10.52879653  7.16189941  7.78146704 12.43904369  7.9045633   9.3507848
  8.57645471  5.92052705 10.46962538 17.67053958 11.66818078 14.63222285
  9.1527157   8.61429288  7.78181549  8.75116389  7.5675279   7.68200971
  8.1646661

**Support Vector Regression**

In [118]:
# Standardise the features before the SVR. The scaler learns its statistics
# from the training set only and is then applied to both sets, so no
# information from the test rows enters the fit.
scaler = StandardScaler().fit(X_train_encoded)
X_train_scaled = scaler.transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Support vector regressor with default hyper-parameters on the scaled data.
svr_reg = SVR().fit(X_train_scaled, y_train)
svr_reg_predictions = svr_reg.predict(X_test_scaled)
svr_reg_mse = mean_squared_error(y_true=y_test, y_pred=svr_reg_predictions)

print("Support Vector Regression Predictions:", svr_reg_predictions)
print("Support Vector Regression MSE:", svr_reg_mse)

Support Vector Regression Predictions: [10.92569081  8.97324779 13.56625423 11.42417261 12.5734841  11.6319503
  8.16496279 11.40799627 14.64188952 13.97841916 16.0889915   8.37960379
  8.49544458 10.71015711  9.18019196 10.74988321  8.85358859 14.78871005
 11.51788307 11.82919265  9.84192996 16.60342085 10.99501642  6.98093383
 14.6182564  12.04632518 15.48243407 16.46411877 11.49169384  9.89065073
 10.1236309  10.01114923 17.56598684 10.3310695  14.97555143 13.44142747
 12.55618723 15.49732448  9.85745212 13.38959389 14.49676417 16.22323859
 11.5931361   9.61179686 14.02450821  9.66221734 10.89233626 16.93715775
 12.62326245 11.07322191 11.88597934 14.49171788 11.35669702 10.60798368
  9.157707   10.45622169 10.99556178 12.92649218 16.77308416  8.03902779
 10.59329398 10.51557446  8.87256252 12.73896038  7.91836505  9.89702181
  9.75394958  6.14710914 10.22501528 15.87726763 12.0697478  14.46011875
  8.91272328  9.88550171 10.25821514  9.90809235  7.7268466  10.05375187
  7.77786693 

**Gradient Descent**

In [119]:
# Linear model trained by stochastic gradient descent. The pipeline
# standardises the features first, with the scaler fitted on the training
# data inside `fit`.
# random_state is fixed (same seed as the train/test split) because SGD
# shuffles the training samples; unseeded, the learned coefficients and the
# reported MSE differ on every run.
gd_reg = make_pipeline(StandardScaler(), SGDRegressor(random_state=42))
gd_reg.fit(X_train_encoded, y_train)
gd_reg_predictions = gd_reg.predict(X_test_encoded)
gd_reg_mse = mean_squared_error(y_test, gd_reg_predictions)

# Report the held-out predictions and their mean squared error.
print("Gradient Descent Regression Predictions:", gd_reg_predictions)
print("Gradient Descent Regression MSE:", gd_reg_mse)

Gradient Descent Regression Predictions: [10.67485836  8.27899454 14.20890901 11.00566263 12.10562729 12.15555233
  5.59907571 10.385965   14.49292536 13.08984225 15.32655463  6.80484418
  6.91327302 11.34947788  7.61050018 11.93768294  8.67563471 17.01299545
 11.2874188  11.88510273  8.91250051 16.53327245 11.18077465  0.61881111
 16.35769265 11.25146806 17.95582258 16.36214926 11.4390537   9.13436636
 10.35986106  9.31514797 19.00504956 10.71504538 15.46638793 13.80245742
 15.06434027 18.67365233  9.52081172 13.23278846 15.8923308  16.69587261
 11.5198245   8.31249298 14.48800909  9.739249   10.83713615 16.80578258
 12.9236752  11.06455112 11.23446248 14.73506825 10.1012861  10.08288409
  6.08641846 10.53940926 10.85513916 13.65914909 17.48396334  7.06681261
 10.16170924  7.93212569  7.6804002  12.79615757  6.57931493  9.32309664
  8.54846393  6.16900372  9.60348475 19.32043703 11.66017443 14.62843059
  8.20735219  8.64443871  8.12044453  9.45631111  6.10105276  8.04873234
  7.276854

**Performance** (A lower MSE indicates better performance)

In [120]:
# Print every model's test-set MSE side by side, in the order the models
# were fitted above.
mse_report = (
    ("Linear Regression MSE:", linear_reg_mse),
    ("Polynomial Regression MSE:", poly_reg_mse),
    ("Ridge Regression MSE:", ridge_reg_mse),
    ("Lasso Regression MSE:", lasso_reg_mse),
    ("Decision Tree Regression MSE:", dt_reg_mse),
    ("Random Forest Regression MSE:", rf_reg_mse),
    ("Gradient Boosting Regression MSE:", gb_reg_mse),
    ("Support Vector Regression MSE:", svr_reg_mse),
    ("Gradient Descent Regression MSE:", gd_reg_mse),
)
for report_label, report_value in mse_report:
    print(report_label, report_value)

Linear Regression MSE: 3.2036108508634795
Polynomial Regression MSE: 1192.909886384272
Ridge Regression MSE: 3.2017082760194695
Lasso Regression MSE: 2.98522561693667
Decision Tree Regression MSE: 4.899521531100478
Random Forest Regression MSE: 2.891993879585327
Gradient Boosting Regression MSE: 2.5608459088329867
Support Vector Regression MSE: 4.964273428122541
Gradient Descent Regression MSE: 3.2465311840200615


In [121]:
# Candidate models paired with their test-set MSE, listed in the order the
# models were evaluated.
mse_by_model = (
    ("Linear Regression", linear_reg_mse),
    ("Polynomial Regression", poly_reg_mse),
    ("Ridge Regression", ridge_reg_mse),
    ("Lasso Regression", lasso_reg_mse),
    ("Decision Tree Regression", dt_reg_mse),
    ("Random Forest Regression", rf_reg_mse),
    ("Gradient Boosting Regression", gb_reg_mse),
    ("Support Vector Regression", svr_reg_mse),
    ("Gradient Descent Regression", gd_reg_mse),
)

# Start from +inf so that any finite MSE replaces it on the first comparison.
best_mse = float('inf')
best_model = None

# Keep the lowest MSE seen so far; the strict "<" means that on a tie the
# earlier model in the list is retained.
for model_name, model_mse in mse_by_model:
    if model_mse < best_mse:
        best_mse, best_model = model_mse, model_name

print("The best model is:", best_model)


The best model is: Gradient Boosting Regression
