# CO2 prediction model

#### Read in data and imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from math import sqrt
from sklearn.metrics import average_precision_score
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

In [2]:
data =  pd.read_csv("data/cleanedData.csv")

def get_best_alpha(model,X,y,alphas):
    kf = KFold(n_splits=3, random_state = 0, shuffle=False)
    results = []
    for alpha in alphas:
        model.set_params(alpha=alpha)
        for train_indexes, test_indexes in kf.split(X):
            X_test, X_val = X.values[train_indexes], X.values[test_indexes]
            y_test, y_val = y.values[train_indexes], y.values[test_indexes]

            model.fit(X_test, y_test)
            mse = mean_squared_error(y_val,model.predict(X_val))
            results.append((alpha, mse))
   

    best_alpha, mse = sorted(results, key=lambda x: x[1])[0]
    return best_alpha, mse

#### Need to remove vehicles without CO2 emissions listed

In [3]:
df_NoCO2 = data[data.CO2.isna()]
df_CO2 = data[data.CO2.notnull()]
df_CO2 = df_CO2.dropna()

#### Make all features numeric

In [4]:
#df_CO2 = np.array_split(df_CO2, 2)[0]

In [5]:
y = df_CO2['CO2']
X = df_CO2.drop(columns=['CO2'])

for (columnName, columnData) in X.iteritems():
    if (X[columnName].dtypes == 'object'):
        v = 0
        print('Colunm Name : ', columnName)
        uniqueValues = X[columnName].unique()
        print(uniqueValues)
        for value in uniqueValues:
            X.loc[(X[columnName] == value), columnName] = v
            v += 1

for (columnName, columnData) in X.iteritems():
    print('Colunm Name : ', columnName)
    print(X[columnName].unique())
    
#X = pd.get_dummies(X)

Colunm Name :  Üldine staatus
['REGISTREERITUD' 'PEATATUD']
Colunm Name :  Kategooria
['M1' 'M1G']
Colunm Name :  Mark
['ALFA ROMEO' 'ALPINA' 'AMG HUMMER' 'ASTON MARTIN' 'AUDI' 'BENTLEY' 'BMW'
 'BUICK' 'CADILLAC' 'CHEVROLET' 'CHRYSLER' 'CITROEN' 'DACIA' 'DAEWOO'
 'DAIHATSU' 'DODGE' 'DS' 'FERRARI' 'FIAT' 'FISKER' 'FORD' 'GMC' 'HONDA'
 'HYUNDAI' 'INFINITI' 'IVECO' 'JAGUAR' 'JEEP' 'KIA' 'LADA' 'LAMBORGHINI'
 'LANCIA' 'LAND ROVER' 'LEXUS' 'LINCOLN' 'LOTUS' 'MAN' 'MASERATI' 'MAZDA'
 'MCC' 'MCLAREN' 'MERCEDES-BENZ' 'MG' 'MICRO-VETT' 'MINI' 'MITSUBISHI'
 'MORGAN' 'NISSAN' 'OMAVALMISTATUD' 'OPEL' 'PEUGEOT' 'PLYMOUTH' 'PONTIAC'
 'PORSCHE' 'RENAULT' 'ROLLS-ROYCE' 'ROVER' 'SAAB' 'SEAT' 'SHUANGHUAN'
 'ŠKODA' 'SMART' 'SSANGYONG' 'SUBARU' 'SUZUKI' 'TESLA' 'TOYOTA' 'VAUXHALL'
 'VOLKSWAGEN' 'VOLVO']
Colunm Name :  Mudel
['145' '146' '147' ... 'XC90' 'XC90 T8 TWIN ENGINE'
 'NILSSON V70 AMBULANCE']
Colunm Name :  Keretüüp
['LUUKPÄRA' 'SEDAAN' 'UNIVERSAAL' 'LAHTINE' 'KUPEE' 'SIHTOTSTARBELINE'
 'MAHTUNIVE

#### Split the data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#### Classifiers

In [7]:
model = RandomForestClassifier(random_state = 0, n_estimators = 10).fit(X_train, y_train)
#model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
lr = LinearRegression().fit(X_train, y_train)
ridge = Ridge().fit(X_train, y_train)
lasso = Lasso().fit(X_train, y_train)

In [8]:
print("LinearRegression:",mean_squared_error(y_test, lr.predict(X_test), squared=False))
print("Ridge:",mean_squared_error(y_test, ridge.predict(X_test), squared=False))
print("Lasso:",mean_squared_error(y_test, lasso.predict(X_test), squared=False))

LinearRegression: 17.999538349511013
Ridge: 17.999539914892566
Lasso: 18.402150816263223


In [9]:
predResults = model.predict(X_test)

In [10]:
acc = accuracy_score(y_test, predResults)
print(model , "accuracy:", acc)

RandomForestClassifier(n_estimators=10, random_state=0) accuracy: 0.8458499606283449


#### Getting better alphas

In [11]:
ridge_alphas = np.linspace(0.1, 10, 100)
lasso_alphas = np.linspace(0.001, 5, 100)

print("Ridge best alpha %.4f - Avg MSE %.4f " % get_best_alpha(Ridge(), X_train, y_train, ridge_alphas))
print("Lasso best alpha %.4f - Avg MSE %.4f " % get_best_alpha(Lasso(), X_train, y_train, lasso_alphas))



Ridge best alpha 0.1000 - Avg MSE 326.0448 




Lasso best alpha 0.0010 - Avg MSE 326.0458 


In [12]:
ridge2 = Ridge(alpha=0.1).fit(X_train, y_train)
lasso2 = Lasso(alpha=0.0010).fit(X_train, y_train)

In [13]:
print("Ridge:",mean_squared_error(y_test, ridge.predict(X_test), squared=False))
print("Lasso:",mean_squared_error(y_test, lasso.predict(X_test), squared=False))

Ridge: 17.999539914892566
Lasso: 18.402150816263223


#### Finding column coefficients

In [17]:
for i in range(len(lr.coef_)):
    print(X.columns[i] + " coefficient:", lr.coef_[i])

Üldine staatus coefficient: 2.5455789509073687
Kategooria coefficient: 11.717551021224313
Mark coefficient: -0.2104087123548282
Mudel coefficient: 0.008480389041215996
Keretüüp coefficient: 1.3632741767083676
Esm reg aasta coefficient: -3.8553116050714538
Värv coefficient: 0.05465664713795378
Mootori tüüp coefficient: -20.735643258450178
Mootori maht coefficient: 0.023219141159114096
Mootori võimsus coefficient: 0.030413666215681274
Tühimass coefficient: 0.05048690181860042
Käigukasti tüüp coefficient: 3.3455438028866533
Telgi kokku coefficient: -1.7763568394002505e-15
Maakond coefficient: -0.09186684473315915
Linn coefficient: -0.009291422105273003


In [22]:
for i in range(len(ridge2.coef_)):
    print(X.columns[i] + " coefficient:", ridge2.coef_[i])

Üldine staatus coefficient: 2.545529750355957
Kategooria coefficient: 11.717441700955316
Mark coefficient: -0.21040909122699059
Mudel coefficient: 0.008480405049116348
Keretüüp coefficient: 1.3632719851389703
Esm reg aasta coefficient: -3.8553126406669596
Värv coefficient: 0.054656794590231435
Mootori tüüp coefficient: -20.73561952670271
Mootori maht coefficient: 0.023219132428947777
Mootori võimsus coefficient: 0.030413867299678706
Tühimass coefficient: 0.05048694334597618
Käigukasti tüüp coefficient: 3.345533692032366
Telgi kokku coefficient: 0.0
Maakond coefficient: -0.09186756308285521
Linn coefficient: -0.009291521086223173


In [19]:
for i in range(len(lasso2.coef_)):
    print(X.columns[i] + " coefficient:", lasso2.coef_[i])

Üldine staatus coefficient: 2.5109596234346023
Kategooria coefficient: 11.700618195594878
Mark coefficient: -0.21040366666909152
Mudel coefficient: 0.00848060737314233
Keretüüp coefficient: 1.362556128118172
Esm reg aasta coefficient: -3.8554827945503054
Värv coefficient: 0.054646984975270034
Mootori tüüp coefficient: -20.73387094130359
Mootori maht coefficient: 0.023219805558437435
Mootori võimsus coefficient: 0.03041718819598196
Tühimass coefficient: 0.05049656812236978
Käigukasti tüüp coefficient: 3.3419866588815736
Telgi kokku coefficient: 0.0
Maakond coefficient: -0.09172435262337053
Linn coefficient: -0.009416904370296186
