# Fuel Consumption Prediction
Made by Praneesh Sharma


## Importing Libraries

In [110]:
import numpy as np # linear algebra
import pandas as pd # data processing

import matplotlib.pyplot as plt #to visualize data
import seaborn as sns #to visualize data

from sklearn.preprocessing import StandardScaler #to scale the data
from sklearn.model_selection import train_test_split #to spilt the data into test and train sets

from sklearn.linear_model import LinearRegression #linear regression library
from sklearn.tree import DecisionTreeRegressor #decision tree library

from sklearn import metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.metrics import accuracy_score,confusion_matrix

## Data Cleaning

In [111]:
# Load the vehicle fuel-consumption CSV into a DataFrame.
# The path is relative to the notebook's working directory.
DATASET_PATH = "../input/vehicle-fuelconsumption/competition_edu_dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [112]:
# Show the freshly loaded DataFrame; as the last expression of the cell
# it is rendered by the notebook.
df

In [113]:
# Report the dimensions of the dataset.
# DataFrame.shape is (number_of_rows, number_of_columns), in that order.
z = df.shape
n_rows, n_columns = z
print("Number of rows: ", n_rows)
print("Number of columns: ", n_columns)

In [114]:
# Print the column names, non-null counts and dtypes of the dataset.
df.info()

##### There are 300 entries in total and a non-null count of 300 in each column. Therefore, there are no null values in the dataset.
##### The type of the 'Horsepower' column is shown as object, which means that column contains some non-numeric values.

In [115]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [116]:
# Examine how the numeric features correlate with the target column 'MPG'.
numeric_cols = df.select_dtypes(include=[np.number])
corr = numeric_cols.corr()

# Correlations with MPG, strongest positive first. The first entry is MPG
# itself, since a column is perfectly correlated with itself.
mpg_corr = corr['MPG'].sort_values(ascending=False)

# Each print is its own statement: joining them with a comma builds a tuple,
# which the notebook would echo as "(None, None)".
print('Top 5 Correlated Features with MPG:')
print(mpg_corr.head(5), '\n')

# The tail of a descending sort holds the most *negative* correlations,
# which are strong inverse relationships rather than "uncorrelated" features.
print('5 Most Negatively Correlated Features with MPG:')
print(mpg_corr.tail(5))

In [117]:
# Scatter plot of MPG against Acceleration.
# Uses an explicit Axes, adds a title so the figure stands alone, and ends
# with plt.show() so no Text(...) repr is echoed under the plot.
fig, ax = plt.subplots()
ax.scatter(x=df['Acceleration'], y=df['MPG'])
ax.set_xlabel('Acceleration')
ax.set_ylabel('MPG')
ax.set_title('MPG vs. Acceleration')
plt.show()

##### To a certain extent, MPG increases with acceleration.

In [118]:
# Scatter plot of MPG against Weight.
# Uses an explicit Axes, adds a title so the figure stands alone, and ends
# with plt.show() so no Text(...) repr is echoed under the plot.
fig, ax = plt.subplots()
ax.scatter(x=df['Weight'], y=df['MPG'])
ax.set_xlabel('Weight')
ax.set_ylabel('MPG')
ax.set_title('MPG vs. Weight')
plt.show()

##### MPG decreases with an increase in weight.
##### We can conclude that: the more the weight, the less the fuel efficiency.

In [119]:
import warnings
# Silence FutureWarning messages for the rest of the session (kept from the
# original cell, since later cells may rely on the filter being installed).
warnings.simplefilter(action='ignore', category=FutureWarning)

# Distribution of the target column: density histogram with a KDE overlay.
# sns.distplot is deprecated; sns.histplot with stat='density' and kde=True
# is its documented replacement and draws the same kind of figure.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    df['MPG'],
    kde=True,
    stat='density',                 # distplot normalised the histogram when a KDE was drawn
    bins=int(180/5),                # 36 bins, as in the original call
    color='darkblue',
    edgecolor='black',              # was hist_kws={'edgecolor': 'black'}
    line_kws={'linewidth': 4},      # was kde_kws={'linewidth': 4}
    ax=ax,
)
ax.set_title('Distribution of MPG')
plt.show()

##### The distribution is right-skewed, i.e., positively skewed.

### Preprocessing

In [120]:
df #original dataset, as loaded by pd.read_csv

In [121]:
# Work on an independent deep copy so the preprocessing below
# leaves the original `df` untouched.
data = df.copy(deep=True)

In [122]:
data #copied dataset (the working copy that the preprocessing cells modify)

In [123]:
# Count, per column, how many cells hold the literal "?" placeholder.
# A non-zero count for a column explains why it was read with dtype object.
data.eq("?").sum(axis=0)

In [124]:
# Turn the "?" placeholders in 'Horsepower' into real missing values and cast
# the column to float. `np.nan` is used because the `np.NaN` alias was removed
# in NumPy 2.0 and raises AttributeError there.
data['Horsepower'] = data['Horsepower'].replace('?', np.nan).astype(float)

# Impute the missing entries with the column mean.
# NOTE(review): the mean is computed on the full dataset, i.e. before the
# train/test split further down; confirm this is acceptable.
data['Horsepower'] = data['Horsepower'].fillna(data['Horsepower'].mean())

In [125]:
# Number of distinct values in 'Car Name' (missing values, if any, count as one).
data['Car Name'].nunique(dropna=False)

There are some duplicates in the 'Car Name' column

To one-hot encode the car names, only the first word of each name, i.e., the company, needs to be kept.

In [126]:
import re  # regular expressions, used to pull the leading word out of each car name

# Derive a 'Company' column: the first run of word characters in 'Car Name'.
leading_word = re.compile(r'^\w+')
data['Company'] = df['Car Name'].map(lambda name: leading_word.search(name).group(0))

# The full car name is not needed once the company has been extracted.
data = data.drop(columns=['Car Name'])

data  # modified dataset

In [127]:
# Frequency of each extracted company name; misspelled variants show up as
# separate entries here.
data['Company'].value_counts()

There are some typos in the Company column.

In [128]:
#a dictionary with the error-correction as key-value pair
correction = {
    'toyouta' : 'toyota',
    'vokswagen' : 'volkswagen',
    'chevroelt' : 'chevrolet',
    'maxda' : 'mazda',
    'vw' : 'volkswagen'
}

data['Company'] = data['Company'].replace(correction) #replacing typos with correct spellings

In [129]:
# Frequency of each company name after applying the `correction` mapping.
data['Company'].value_counts()

In [130]:
# True when none of the misspellings listed in `correction` remain in the
# 'Company' column. (Comparing len(value_counts()) with len(unique()) is
# true for any column without missing values, so it verified nothing.)
not data['Company'].isin(list(correction)).any()

##### All the known typos in the Company column have now been corrected.

In [131]:
# Distinct values present in the 'Cylinders' column.
data['Cylinders'].unique()

In [132]:
# Distinct values present in the 'Origin' column.
data['Origin'].unique()

In [133]:
# Number of distinct values in each candidate categorical column,
# printed in the order Cylinders, Origin, Company.
cardinalities = [
    data[name].nunique(dropna=False)
    for name in ('Cylinders', 'Origin', 'Company')
]
print(*cardinalities)

##### Hence, the Cylinders, Origin, and Company columns hold nominal values and can be one-hot encoded.

In [134]:
# Nominal columns to one-hot encode, mapped to the prefix of their dummy columns.
nominal_dict = {
    'Cylinders': 'Cyl',
    'Origin': 'Orig',
    'Company': 'Comp',
}

# Build one block of indicator columns per nominal column ...
encoded_parts = [
    pd.get_dummies(data[name], prefix=tag)
    for name, tag in nominal_dict.items()
]

# ... then drop the source columns and append the blocks in the same order.
data = pd.concat(
    [data.drop(columns=list(nominal_dict)), *encoded_parts],
    axis=1,
)

In [135]:
data #the new dataframe: nominal columns replaced by their one-hot encoded indicator columns

##### Now, we will separate the column to be predicted from the main dataframe

In [136]:
# Pull the target column out as an independent Series, then keep only the
# feature columns in `data`.
mpg = data.loc[:, 'MPG'].copy()
data = data.drop(columns=['MPG'])

## Model Creation

### Splitting the data

In [137]:
# Features and target, then a 70/30 train/test split with a fixed seed so the
# partition is reproducible.
X, Y = data, mpg
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    random_state=70,
)

In [138]:
# Training features produced by the split above (not yet scaled).
X_train

### Scaling the data

In [139]:
# Standardise the features. The scaler is fitted on the training set only and
# then applied to both sets, so no test-set statistics reach the model.
scaler = StandardScaler()
scaler.fit(X_train)

# scaler.transform returns a plain array, so the DataFrames are rebuilt.
# The original row index is passed back explicitly: without it the frames get
# a fresh 0..n-1 index and no longer line up with Y_train / Y_test by label.
X_train = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)  # scaled train dataset
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)  # scaled test dataset

In [140]:
 X_train #the scaled train dataset (NOTE(review): this line starts with a stray leading space - TODO remove)

### Training

#### Using Linear Regression

In [141]:
#using linear regression
model = LinearRegression()
model.fit(X_train, Y_train) #the fit method is used to fit the model to the dataset. This will ingest the data and learn from it

In [142]:
# Predictions of the linear model on the train and test features, in that order.
train_pred, test_pred = (model.predict(split) for split in (X_train, X_test))

In [143]:
# R-squared of the linear model on each split (model.score returns R-squared
# for regressors).
for template, features, target in (
    ('RSquared value on train: {:.4f}', X_train, Y_train),
    ('RSquared value on test: {:.4f}', X_test, Y_test),
):
    print(template.format(model.score(features, target)))

In [144]:
# Test-set performance of the linear model.
# model.score() returns R-squared for a regressor. R-squared is not an
# accuracy: it can be negative and is not a share of correct predictions,
# so it is reported under its own name alongside two error metrics.
lr_test_pred = model.predict(X_test)
lr_test_r2 = model.score(X_test, Y_test)
lr_test_mae = mean_absolute_error(Y_test, lr_test_pred)
lr_test_rmse = np.sqrt(mean_squared_error(Y_test, lr_test_pred))

# MAE and RMSE are expressed in the units of the 'MPG' target column.
print("R-squared of this model on the test set: {:.4f}".format(lr_test_r2))
print("Mean absolute error on the test set: {:.2f}".format(lr_test_mae))
print("Root mean squared error on the test set: {:.2f}".format(lr_test_rmse))

In [145]:
# Actual vs. predicted target values of the linear model on the test set.
# Samples are plotted in the (shuffled) order of the test split, so the
# x axis is only a running sample number.
fig, ax = plt.subplots(figsize=(18, 10))
x_ax = range(len(Y_test))
ax.plot(x_ax, Y_test, label="original")
ax.plot(x_ax, test_pred, label="predicted")
# Title fixed ("comsumption" -> "consumption") and made model-specific so it
# can be told apart from the decision-tree plot.
ax.set_title("Actual and predicted fuel consumption - Linear Regression (test set)")
ax.set_xlabel("Test sample")
ax.set_ylabel("MPG")
ax.legend()
plt.show()

#### Using Decision Tree

In [146]:
# Fit a decision-tree regressor on the scaled training data.
# random_state is fixed (same seed as the train/test split) because the tree
# builder is randomised; without it the scores can change between runs.
DT = DecisionTreeRegressor(random_state=70)
DT.fit(X_train, Y_train)

In [147]:
# Predictions of the decision tree on the train and test features, in that order.
train_preds, test_preds = map(DT.predict, (X_train, X_test))

In [148]:
# R-squared of the decision tree on each split (DT.score returns R-squared
# for regressors).
dt_train_r2 = DT.score(X_train, Y_train)
dt_test_r2 = DT.score(X_test, Y_test)
print('RSquared value on train: {:.4f}'.format(dt_train_r2))
print('RSquared value on test: {:.4f}'.format(dt_test_r2))

In [149]:
# Test-set performance of the decision tree.
# DT.score() returns R-squared for a regressor. R-squared is not an
# accuracy: it can be negative and is not a share of correct predictions,
# so it is reported under its own name alongside two error metrics.
dt_test_pred = DT.predict(X_test)
dt_test_score = DT.score(X_test, Y_test)
dt_test_mae = mean_absolute_error(Y_test, dt_test_pred)
dt_test_rmse = np.sqrt(mean_squared_error(Y_test, dt_test_pred))

# MAE and RMSE are expressed in the units of the 'MPG' target column.
print("R-squared of this model on the test set: {:.4f}".format(dt_test_score))
print("Mean absolute error on the test set: {:.2f}".format(dt_test_mae))
print("Root mean squared error on the test set: {:.2f}".format(dt_test_rmse))

In [150]:
# Actual vs. predicted target values of the decision tree on the test set.
# Samples are plotted in the (shuffled) order of the test split, so the
# x axis is only a running sample number.
fig, ax = plt.subplots(figsize=(18, 10))
x_ax = range(len(Y_test))
ax.plot(x_ax, Y_test, label="original")
ax.plot(x_ax, test_preds, label="predicted")
# Title fixed ("comsumption" -> "consumption") and made model-specific so it
# can be told apart from the linear-regression plot.
ax.set_title("Actual and predicted fuel consumption - Decision Tree (test set)")
ax.set_xlabel("Test sample")
ax.set_ylabel("MPG")
ax.legend()
plt.show()