# 8 steps of Machine Learning
1. Data gathering
2. Data preprocessing
3. Exploratory Data Analysis (EDA)
4. Feature engineering/selection
5. Training model
6. Test model/Model evaluation
7. Hyperparameter tuning
8. Prediction

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 1. Data gathering

In [None]:
# Load the fuel-consumption CSV from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer='FuelConsumption.csv')
data.head(n=5)

Check the basic info and missing values

In [None]:
# (number of rows, number of columns) of the loaded frame
data.shape

In [None]:
# Print the column names, dtypes and non-null counts
data.info()

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Remove fully duplicated rows from `data` in place, keeping the first occurrence.
data.drop_duplicates(keep='first', inplace=True)

In [None]:
# Count how many rows belong to each category of the MAKE column.
# (Attribute access, `data.MAKE`, selects the same column; only one call is needed,
# because a notebook cell displays just its last expression.)
data['MAKE'].value_counts()

In [None]:
# Frequency of each distinct value in the MODEL column
data.MODEL.value_counts()

In [None]:
# Frequency of each distinct value in the VEHICLECLASS column
data.VEHICLECLASS.value_counts()

In [None]:
# Frequency of each distinct value in the TRANSMISSION column
data.TRANSMISSION.value_counts()

In [None]:
# Distinct category labels present in the FUELTYPE column
data.FUELTYPE.unique()

In [None]:
# Frequency of each distinct value in the FUELTYPE column
data.FUELTYPE.value_counts()

In [None]:
# Frequency of each distinct value in the MODELYEAR column
data.MODELYEAR.value_counts()

In [None]:
# Drop the MODELYEAR column.
# errors='ignore' keeps the cell re-runnable: a plain in-place drop raises KeyError
# the second time it is executed, because the column is already gone.
data = data.drop(columns=['MODELYEAR'], errors='ignore')

Statistical Analysis

In [None]:
# Summary statistics of the numeric columns, one row per column
data.describe().transpose()

# 2. Data preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single LabelEncoder instance used below to turn category labels into integer codes
encoder=LabelEncoder()

In [None]:
# Replace each categorical column with its integer codes, writing the result back
# into `data`. The same `encoder` object is re-fitted on every column in turn.
categorical_columns = ['MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION', 'FUELTYPE']
for column in categorical_columns:
    data[column] = encoder.fit_transform(data[column])

In [None]:
# Display the frame to inspect the label-encoded columns
data

# 3. EDA

In [None]:
# Matplotlib scatter plot of engine size against CO2 emissions.
sns.set(style='white')
plt.figure(figsize=(10,8))
plt.scatter(x=data['ENGINESIZE'],y=data['CO2EMISSIONS'])
# plt.scatter does not label the axes itself, so name them explicitly
plt.xlabel('ENGINESIZE')
plt.ylabel('CO2EMISSIONS')
plt.title('ENGINESIZE VS CO2EMISSIONS')
plt.show()

In [None]:
# Same relationship drawn with seaborn, which labels the axes from the column names.
plt.figure(figsize=(10,10))
sns.scatterplot(x='ENGINESIZE', y='CO2EMISSIONS', data=data)
plt.title('ENGINESIZE VS CO2EMISSIONS')
plt.show()

In [None]:
# Distribution of ENGINESIZE along the x axis.
# sns.distplot is deprecated; sns.histplot is its replacement for a plain histogram.
sns.histplot(data['ENGINESIZE'], kde=False)
plt.title('ENGINESIZE distribution (seaborn)')
plt.show()

# The same histogram through the pandas plotting API, on its own figure so the
# two versions are not drawn on top of each other.
data['ENGINESIZE'].plot(kind='hist', title='ENGINESIZE distribution (pandas)')
plt.show()

In [None]:
# Pairwise scatter plots of all columns.
# pairplot builds its own figure, so a preceding plt.figure() would only leave
# an empty extra figure in the output.
sns.pairplot(data=data)

In [None]:
# Letter-value (boxen) plot of ENGINESIZE to look for extreme values
sns.boxenplot(x='ENGINESIZE', data=data)

# 4. Feature Engineering/Selection

Finding important features

In [None]:
# Pairwise Pearson correlation between every pair of columns
correlation = data.corr(method='pearson')
correlation

In [None]:
# Annotated heatmap of the correlation matrix
plt.figure(figsize=(10, 10))
sns.heatmap(data=correlation, cmap='Greens', annot=True)

In [None]:
# Drop the features that will not be used for modelling.
# errors='ignore' keeps the cell re-runnable: a plain in-place drop raises KeyError
# once the columns have already been removed.
data = data.drop(
    columns=['MAKE', 'MODEL', 'VEHICLECLASS', 'TRANSMISSION',
             'FUELTYPE', 'FUELCONSUMPTION_COMB_MPG'],
    errors='ignore',
)

In [None]:
# Heatmap for the remaining columns only.
# `correlation` was computed before the columns were dropped, so it must be
# recomputed here; otherwise this plot just repeats the earlier heatmap.
correlation = data.corr()
plt.figure(figsize=(10,10))
sns.heatmap(correlation, annot=True,cmap='Greens')

In [None]:
# Display the frame after the unused feature columns were dropped
data

Detect outliers with the quartile (IQR) rule and remove them

In [None]:
# First quartile (25th percentile) of ENGINESIZE, using the 'midpoint' method
Q1 = np.percentile(data['ENGINESIZE'], q=25, method='midpoint')
Q1

In [None]:
# Third quartile (75th percentile) of ENGINESIZE, using the 'midpoint' method
Q3 = np.percentile(data['ENGINESIZE'], q=75, method='midpoint')
Q3

In [None]:
# Interquartile range: distance between the third and first quartiles
iQR=Q3-Q1
iQR

In [None]:
# Upper fence: ENGINESIZE values above this are flagged as outliers below
Q3+1.5*iQR

In [None]:
# Lower fence: ENGINESIZE values below this are flagged as outliers below
Q1-1.5*iQR

In [None]:
# Row positions whose ENGINESIZE lies above the upper fence Q3 + 1.5*IQR
Outliers_index_upper = np.where(data['ENGINESIZE'].gt(Q3 + 1.5 * iQR))
Outliers_index_upper[0]

In [None]:
# Row positions whose ENGINESIZE lies below the lower fence Q1 - 1.5*IQR
Outliers_index_lower = np.where(data['ENGINESIZE'].lt(Q1 - 1.5 * iQR))
Outliers_index_lower[0]

In [None]:
# Drop the rows flagged as upper-fence outliers.
# np.where yields positional offsets, whereas DataFrame.drop expects index labels.
# The two differ once rows have been removed without resetting the index (as
# drop_duplicates did earlier), so translate positions to labels before dropping.
data.drop(index=data.index[Outliers_index_upper[0]], inplace=True)

In [None]:
# Display the frame after the outlier rows were dropped
data

In [None]:
# Re-draw the ENGINESIZE boxen plot after outlier removal.
# Same keyword form as the earlier boxen plot, so the two figures share an
# orientation and do not depend on how seaborn treats a positional Series.
sns.boxenplot(data=data, x='ENGINESIZE')

# 5. Training model

Train & test split

In [None]:
from sklearn.model_selection import train_test_split

When a single feature is used for modeling, it is called simple linear regression;
when more than one feature is used, it is called multiple linear regression.

In [None]:
# Simple linear regression: ENGINESIZE is the only predictor of CO2EMISSIONS.
# 20% of the rows are held out for testing; the fixed seed makes the split repeatable.
train_features, test_features, train_target, test_target = train_test_split(
    data.loc[:, ['ENGINESIZE']],
    data.loc[:, ['CO2EMISSIONS']],
    random_state=2,
    test_size=0.2,
)


In [None]:
# Display the held-out feature rows
test_features

In [None]:
# Show the held-out features and targets.
# A cell renders only its last bare expression, so display() is used to show both.
display(test_features, test_target)

Modeling 

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted linear regression estimator, constructed with default settings
model=LinearRegression()

In [None]:
# Fit the regression on the training split
model.fit(X=train_features, y=train_target)

In [None]:
# theta_0: the fitted intercept term of the regression line
model.intercept_ # intercept_ holds the theta_0 value learned by fit()

In [None]:
# theta_1: the fitted slope coefficient(s)
model.coef_ # coef_ holds the learned coefficients; there can be several when several features are used

In [None]:
# Display the held-out CO2EMISSIONS values
test_target

In [None]:
# Reproduce the model's prediction by hand for the first test row:
# y_hat = theta_1 * x + theta_0
x = test_features.to_numpy()[0]  # ENGINESIZE of the first test row
yhat = model.coef_ * x + model.intercept_
yhat


In [None]:
# Actual CO2EMISSIONS of the first test row, taken from the frame's NumPy array
test_target.to_numpy()[0]

# 6. Model testing and Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score #sklearn.metrics contain all error evaluation module 

In [None]:
# Manual mean-squared-error calculation on the test split.
y_hat=model.predict(test_features) # predictions for the held-out rows
# Divide by the actual number of test rows rather than a hard-coded count, so the
# result stays correct when the amount of data (and so the split size) changes.
np.sum((test_target - y_hat)**2)/len(test_target)


In [None]:
# Mean squared error via scikit-learn, to compare with the manual calculation
mean_squared_error(y_true=test_target, y_pred=y_hat)

In [None]:
# R^2 (coefficient of determination) on the test split, shown as a percentage
r2 = r2_score(y_true=test_target, y_pred=y_hat)
r2 * 100

In [None]:
# Fit a one-feature linear regression for every predictor column and report the
# test-set mean squared error and R^2 (as a percentage) for each.
# The target is excluded by name instead of assuming it is the last column.
for features in data.columns.drop('CO2EMISSIONS'):
    print('__________')
    train_features, test_features, train_target, test_target = train_test_split(
        data[[features]], data[['CO2EMISSIONS']], test_size=0.2, random_state=2)

    model = LinearRegression()
    model.fit(train_features, train_target)
    y_hat = model.predict(test_features)
    print(f'ERROR OF {features}: ',mean_squared_error(test_target,y_hat))
    print(f'ACCURICY OF {features}: ',r2_score(test_target, y_hat)*100)

Multiple Linear Regression

In [None]:
from itertools import combinations # module of combinations

In [None]:
# Multiple linear regression: fit one model per combination of predictors and
# report the test-set mean squared error and R^2 (as a percentage) for each.
# The second argument of combinations() is the number of features per
# combination; it is 3 here.
feature_combinations = list(combinations(
    ['ENGINESIZE', 'CYLINDERS', 'FUELCONSUMPTION_CITY',
     'FUELCONSUMPTION_HWY', 'FUELCONSUMPTION_COMB'], 3))
for features in feature_combinations:
    print('---------------')
    # combinations() yields tuples; column selection needs a list
    train_features, test_features, train_target, test_target = train_test_split(
        data[list(features)], data[['CO2EMISSIONS']], test_size=0.2, random_state=2)
    model = LinearRegression()
    model.fit(train_features, train_target)
    y_hat = model.predict(test_features)
    print(f'ERROR OF {features}: ',mean_squared_error(test_target,y_hat))
    print(f'ACCURICY OF {features}: ',r2_score(test_target, y_hat)*100)

Cross validation 

 k-fold Cross-Validation is a statistical method used to estimate the skill of machine learning models.

In [None]:
# Predictor matrix: every column except the CO2EMISSIONS target
features = data.drop('CO2EMISSIONS', axis=1)
features

In [67]:
# Target kept as a one-column DataFrame
target = data.loc[:, ['CO2EMISSIONS']]
target

Unnamed: 0,CO2EMISSIONS
0,196
1,221
2,136
3,255
4,244
...,...
1062,271
1063,264
1064,271
1065,260


In [None]:
from sklearn.model_selection import KFold # module of k-fold Cross-Validation

In [None]:
# Print the documentation of the KFold class (its parameters and methods).
# help() is plain Python, so this cell does not rely on IPython's `?` syntax.
help(KFold)

In [None]:
# K-fold splitter; folds.split(...) yields the train/test row indices of each fold.
# random_state only takes effect together with shuffle=True; scikit-learn raises
# ValueError when a seed is given while shuffle is left at its default of False.
folds = KFold(n_splits=5, shuffle=True, random_state=2)

In [None]:
# Column labels currently present in `data`
data.columns

In [None]:
# Summary statistics of the current columns, one row per column
data.describe().transpose()