# CO2 prediction model

#### Read in data and imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn.metrics import average_precision_score
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [2]:
data =  pd.read_csv("data/cleanedData.csv")

def get_best_alpha(model,X,y,alphas):
    kf = KFold(n_splits=3, random_state = 0, shuffle=False)
    results = []
    for alpha in alphas:
        model.set_params(alpha=alpha)
        for train_indexes, test_indexes in kf.split(X):
            X_test, X_val = X.values[train_indexes], X.values[test_indexes]
            y_test, y_val = y.values[train_indexes], y.values[test_indexes]

            model.fit(X_test, y_test)
            mse = mean_squared_error(y_val,model.predict(X_val))
            results.append((alpha, mse))
   

    best_alpha, mse = sorted(results, key=lambda x: x[1])[0]
    return best_alpha, mse

#### Need to remove vehicles without CO2 emissions listed

In [3]:
df_NoCO2 = data[data.CO2.isna()]
df_CO2 = data[data.CO2.notnull()]
#df_CO2 = df_CO2[df_CO2['Käigukasti tüüp'] != np.nan]


#### Make all features numeric

In [4]:
#df_CO2 = np.array_split(df_CO2, 2)[0]

In [5]:
y = df_CO2['CO2']
X = df_CO2.drop(columns=['CO2'])

for (columnName, columnData) in X.iteritems():
    print('Colunm Name : ', columnName)
    uniqueValues = X[columnName].unique()
    print(uniqueValues)
    if ((X[columnName].dtypes == 'object')):
        v = 1
        for value in uniqueValues:
            X.loc[(X[columnName] == value), columnName] = v
            v += 1
        X[columnName] = X[columnName].fillna(0)

for (columnName, columnData) in X.iteritems():
    print('Colunm Name : ', columnName)
    print(X[columnName].unique())
    
#X = pd.get_dummies(X)

Colunm Name :  Üldine staatus
['REGISTREERITUD' 'PEATATUD']
Colunm Name :  Kategooria
['M1' 'M1G']
Colunm Name :  Mark
['ALFA ROMEO' 'ALPINA' 'ALPINE' 'AMG HUMMER' 'ASTON MARTIN' 'AUDI'
 'BENTLEY' 'BMW' 'BUICK' 'CADILLAC' 'CHEVROLET' 'CHRYSLER' 'CITROEN'
 'DACIA' 'DAEWOO' 'DAIHATSU' 'DODGE' 'DONKERVOORT' 'DR MOTOR' 'DS'
 'FERRARI' 'FIAT' 'FISKER' 'FORD' 'GMC' 'HONDA' 'HYUNDAI' 'INFINITI'
 'ISUZU' 'IVECO' 'JAGUAR' 'JEEP' 'KIA' 'LADA' 'LAMBORGHINI' 'LANCIA'
 'LAND ROVER' 'LES DAUPHINS' 'LEXUS' 'LINCOLN' 'LOTUS' 'MAN' 'MASERATI'
 'MAZDA' 'MCC' 'MCLAREN' 'MERCEDES-BENZ' 'MG' 'MINI' 'MITSUBISHI' 'MORGAN'
 'NISSAN' 'OPEL' 'PEUGEOT' 'PLYMOUTH' 'PONTIAC' 'PORSCHE' 'RENAULT'
 'ROLLS-ROYCE' 'ROVER' 'SAAB' 'SEAT' 'SHUANGHUAN' 'ŠKODA' 'SMART'
 'SSANGYONG' 'SUBARU' 'SUZUKI' 'TOYOTA' 'VAZ' 'VAUXHALL' 'VOLKSWAGEN'
 'VOLVO']
Colunm Name :  Mudel
['145' '146' '147' ... 'XC90' 'XC90 T8 TWIN ENGINE'
 'NILSSON V70 AMBULANCE']
Colunm Name :  Keretüüp
['LUUKPÄRA' 'SEDAAN' 'UNIVERSAAL' 'LAHTINE' 'KUPEE' 'MAH

#### Split the data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#### Classifiers

In [7]:
model = RandomForestClassifier(random_state = 0, n_estimators = 10).fit(X_train, y_train)
#model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
lr = LinearRegression().fit(X_train, y_train)
ridge = Ridge().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)

In [8]:
print("LinearRegression:",mean_squared_error(y_test, lr.predict(X_test), squared=False))
print("Ridge:",mean_squared_error(y_test, ridge.predict(X_test), squared=False))
print("Lasso:",mean_squared_error(y_test, lasso.predict(X_test), squared=False))

LinearRegression: 17.983926778097587
Ridge: 17.98392673284698
Lasso: 18.384037940826516


In [9]:
predResults = model.predict(X_test)

In [10]:
acc = accuracy_score(y_test, predResults)
print(model , "accuracy:", acc)

RandomForestClassifier(n_estimators=10, random_state=0) accuracy: 0.8705281945051716


#### Getting better alphas

In [11]:
ridge_alphas = np.linspace(0.1, 10, 100)
lasso_alphas = np.linspace(0.001, 5, 100)

print("Ridge best alpha %.4f - Avg MSE %.4f " % get_best_alpha(Ridge(), X_train, y_train, ridge_alphas))
print("Lasso best alpha %.4f - Avg MSE %.4f " % get_best_alpha(Lasso(), X_train, y_train, lasso_alphas))



Ridge best alpha 0.1000 - Avg MSE 319.3770 




Lasso best alpha 0.0010 - Avg MSE 319.3771 


In [12]:
ridge2 = Ridge(alpha=0.1).fit(X_train, y_train)
lasso2 = Lasso(alpha=0.0010).fit(X_train, y_train)

In [13]:
print("Ridge:",mean_squared_error(y_test, ridge.predict(X_test), squared=False))
print("Lasso:",mean_squared_error(y_test, lasso.predict(X_test), squared=False))

Ridge: 17.98392673284698
Lasso: 18.384037940826516


#### Finding column coefficients

In [14]:
for i in range(len(lr.coef_)):
    print(X.columns[i] + " coefficient:", lr.coef_[i])

Üldine staatus coefficient: 1.5022127997176844
Kategooria coefficient: 12.635263438280843
Mark coefficient: -0.19492253923255098
Mudel coefficient: 0.007229411864185479
Keretüüp coefficient: 1.4205927062796402
Esm reg aasta coefficient: -3.7030264281127
Värv coefficient: 0.05471613835612219
Mootori tüüp coefficient: -22.466474617050075
Mootori maht coefficient: 0.024940022414884577
Mootori võimsus coefficient: 0.010033049607092623
Tühimass coefficient: 0.05336504009924912
Käigukasti tüüp coefficient: 1.8416570638356138
Telgi kokku coefficient: -1.0658141036401503e-14
Maakond coefficient: -0.03769371105952954
Linn coefficient: 0.017976456103185132


In [15]:
for i in range(len(ridge2.coef_)):
    print(X.columns[i] + " coefficient:", ridge2.coef_[i])

Üldine staatus coefficient: 1.5021979198774411
Kategooria coefficient: 12.635199835550075
Mark coefficient: -0.19492282201175237
Mudel coefficient: 0.007229422799820544
Keretüüp coefficient: 1.420591110682824
Esm reg aasta coefficient: -3.7030271127453322
Värv coefficient: 0.05471626182141867
Mootori tüüp coefficient: -22.466456899336407
Mootori maht coefficient: 0.02494001225192467
Mootori võimsus coefficient: 0.01003321472727654
Tühimass coefficient: 0.053365061938720315
Käigukasti tüüp coefficient: 1.8416542284238966
Telgi kokku coefficient: 0.0
Maakond coefficient: -0.03769387079261876
Linn coefficient: 0.017976470966754952


In [16]:
for i in range(len(lasso2.coef_)):
    print(X.columns[i] + " coefficient:", lasso2.coef_[i])

Üldine staatus coefficient: 1.4717065756462226
Kategooria coefficient: 12.61790267470762
Mark coefficient: -0.1949777234365671
Mudel coefficient: 0.007231749457370385
Keretüüp coefficient: 1.419613502909684
Esm reg aasta coefficient: -3.703173282516459
Värv coefficient: 0.05469864618450196
Mootori tüüp coefficient: -22.464132744723365
Mootori maht coefficient: 0.024939964143526634
Mootori võimsus coefficient: 0.010036696922641138
Tühimass coefficient: 0.053374534326742155
Käigukasti tüüp coefficient: 1.839382877679788
Telgi kokku coefficient: 0.0
Maakond coefficient: -0.037757868001531095
Linn coefficient: 0.01784361060219596
