In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import os

In [2]:
# Move into the data folder, but only once: re-running this cell after it has
# already succeeded would otherwise look for a nested "dataset/dataset"
# directory and raise FileNotFoundError.
already_inside = (
    os.path.basename(os.getcwd()) == "dataset" and not os.path.isdir("dataset")
)
if not already_inside:
    os.chdir("dataset")

In [3]:
# Read the measurements CSV; the name is resolved against the current
# working directory.
csv_name = 'infrared_thermography_temperature.csv'
df = pd.read_csv(csv_name)

# Preview the first few rows to check that the columns parsed as expected.
df.head()

Unnamed: 0,Gender,Age,Ethnicity,T_atm,Humidity,Max1R13_1,Max1L13_1,aveAllR13_1,aveAllL13_1,T_LC1,RCC1,LCC1,canthiMax1,canthi4Max1,T_FHRC1
0,Male,41-50,White,24.0,28.0,35.03,35.3775,34.4,34.9175,35.3375,34.785,34.465,35.3775,35.3375,33.4775
1,Female,31-40,Black or African-American,24.0,26.0,34.55,34.52,33.93,34.225,34.56,34.3225,34.24,34.74,34.715,34.055
2,Female,21-30,White,24.0,26.0,35.6525,35.5175,34.2775,34.8,35.5025,35.3575,35.0925,35.7175,35.6825,34.8275
3,Female,21-30,Black or African-American,24.0,27.0,35.2225,35.6125,34.385,35.2475,35.595,34.91,35.17,35.6125,35.595,34.4225
4,Male,18-20,White,24.0,27.0,35.545,35.665,34.91,35.3675,35.64,35.355,35.12,35.665,35.6475,35.16


In [14]:
# Regression target: the T_atm column.
y = df['T_atm']

# Features: every remaining column except the three text-valued ones
# (Gender, Age, Ethnicity) and the target itself.
non_feature_cols = ['Gender', 'Age', 'Ethnicity', 'T_atm']
X = df.drop(columns=non_feature_cols)

In [15]:
# Sanity check: (rows, feature columns) left after dropping the non-feature columns.
X.shape

(1020, 11)

In [16]:
# Pairwise correlation matrix of the feature columns (pandas defaults),
# used to eyeball how strongly the features overlap with one another.
X.corr()

Unnamed: 0,Humidity,Max1R13_1,Max1L13_1,aveAllR13_1,aveAllL13_1,T_LC1,RCC1,LCC1,canthiMax1,canthi4Max1,T_FHRC1
Humidity,1.0,0.050884,0.060754,0.101012,0.112754,0.064622,0.102132,0.119523,0.051423,0.052275,0.033311
Max1R13_1,0.050884,1.0,0.901213,0.868731,0.825698,0.902975,0.950207,0.848891,0.95585,0.956726,0.665614
Max1L13_1,0.060754,0.901213,1.0,0.775313,0.89637,0.988479,0.87814,0.938003,0.95189,0.952341,0.640706
aveAllR13_1,0.101012,0.868731,0.775313,1.0,0.858994,0.769817,0.851173,0.759324,0.811333,0.812102,0.609243
aveAllL13_1,0.112754,0.825698,0.89637,0.858994,1.0,0.878311,0.82475,0.874104,0.849653,0.850296,0.620341
T_LC1,0.064622,0.902975,0.988479,0.769817,0.878311,1.0,0.874455,0.934135,0.963246,0.963761,0.655256
RCC1,0.102132,0.950207,0.87814,0.851173,0.82475,0.874455,1.0,0.867935,0.918818,0.920075,0.652436
LCC1,0.119523,0.848891,0.938003,0.759324,0.874104,0.934135,0.867935,1.0,0.896997,0.897871,0.633967
canthiMax1,0.051423,0.95585,0.95189,0.811333,0.849653,0.963246,0.918818,0.896997,1.0,0.999786,0.662722
canthi4Max1,0.052275,0.956726,0.952341,0.812102,0.850296,0.963761,0.920075,0.897871,0.999786,1.0,0.664494


In [44]:
# Split into train and test sets. No test_size is passed, so scikit-learn's
# default proportion applies; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [47]:
# Baseline: plain linear regression fitted on the unscaled training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [48]:
# Predict on the held-out split and report the fit quality.
y_pred = model.predict(X_test)

# r2_score is the coefficient of determination (R^2) of a regression, not a
# classification accuracy, so label it the same way as the SGD and
# RandomForest cells do.
print(f'R2 Score (LinearRegression): {r2_score(y_test, y_pred)}')

Accuracy 0.22566778320751624


#### Using ElasticNet, which might perform better because its regularization reduces the influence of less useful features

In [34]:
from sklearn.linear_model import ElasticNet

In [51]:
# ElasticNet with its default penalty settings, fitted on the same unscaled
# training split as the linear baseline. Note: this rebinds `model`.
model = ElasticNet()
model.fit(X=X_train, y=y_train)

In [52]:
# Predict on the held-out split with whichever estimator `model` currently
# holds (the ElasticNet fitted in the previous cell when run in order).
y_pred = model.predict(X_test)

# r2_score is the coefficient of determination (R^2) of a regression, not a
# classification accuracy, so label it the same way as the SGD and
# RandomForest cells do.
print(f'R2 Score (ElasticNet): {r2_score(y_test, y_pred)}')

Accuracy 0.01926418638222338


#### Using Gradient Descent for a better R2 score

In [37]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

In [41]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test data never influences the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Show the standardized training matrix (NumPy abbreviates large arrays).
print(X_train_scaled)

[[ 0.70806488  0.50769016  1.3274743  ...  1.16509468  1.14865815
   1.31584313]
 [ 1.65808247  0.94067006  0.97454791 ...  0.77253418  0.78508473
   1.32706022]
 [ 1.4756791   0.94067006  1.05349197 ...  0.89700458  0.90946511
   1.70470209]
 ...
 [-0.81196327  0.05284259  0.23154497 ...  0.16933148  0.16318281
   0.56429841]
 [ 0.54846192  0.37211059  0.80272848 ...  0.49486945  0.51240466
   1.6673118 ]
 [-0.78156271 -0.27954602  0.07830061 ... -0.1370572  -0.15733587
  -0.24707076]]


In [55]:
# SGDRegressor shuffles the training data between epochs, so without a seed
# the fitted weights (and the reported score) change on every run. Pin the
# seed to the same value used for the train/test split and the random forest.
sgd_regressor = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
sgd_regressor.fit(X_train_scaled, y_train)

In [58]:
# Predict once on the scaled test split and score those predictions directly.
# Calling sgd_regressor.score() here would run predict() a second time and
# leave y_pred unused; for a regressor, score() is r2_score(y, predict(X)),
# so the printed value is unchanged.
y_pred = sgd_regressor.predict(X_test_scaled)
print(f'R2 Score: {r2_score(y_test, y_pred)}')

R2 Score: 0.21253664672448236


#### The linear models are not showing good results on this dataset, so let's move to non-linear models

In [59]:
from sklearn.ensemble import RandomForestRegressor

In [60]:
# Random forest of 100 trees on the unscaled training split; the seed makes
# the fitted forest repeatable.
rf_regressor = RandomForestRegressor(random_state=42, n_estimators=100)
rf_regressor.fit(X=X_train, y=y_train)

In [61]:
# Predict once on the test split and score those predictions directly.
# Calling rf_regressor.score() here would run predict() a second time and
# leave y_pred_rf unused; for a regressor, score() is r2_score(y, predict(X)),
# so r2_rf is unchanged.
y_pred_rf = rf_regressor.predict(X_test)

r2_rf = r2_score(y_test, y_pred_rf)
print(f'R2 Score (RandomForest): {r2_rf}')

R^2 Score (RandomForest): 0.21710080501112483


#### The R2 scores are not getting better, so there might be some issue with the data; we may need to use more complex models because linear models are failing to work on this dataset