In [56]:
# Location of the input CSV, relative to the notebook's working directory.
path = "data.csv"

In [70]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import Lasso,Ridge,LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.metrics import mean_squared_error

In [58]:
# Load the raw dataset and take a first look at its schema and first rows.
data = pd.read_csv(path)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; printing its return value would add a stray "None" line.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null float64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model_year      398 non-null int64
origin          398 non-null int64
car_name        398 non-null object
dtypes: float64(3), int64(4), object(2)
memory usage: 28.1+ KB
None


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,chevroletchevellemalibu
1,15.0,8,350.0,165,3693,11.5,70,1,buickskylark320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouthsatellite
3,16.0,8,304.0,150,3433,12.0,70,1,amcrebelsst
4,17.0,8,302.0,140,3449,10.5,70,1,fordtorino


In [59]:
#dealing with missing values
# Rows whose horsepower is the placeholder string '?' are dropped, then the
# column (loaded as object dtype) is converted to float.
# The explicit .copy() makes the filtered frame independent of the frame it
# was selected from, so the column assignment below writes to a real frame
# and does not raise SettingWithCopyWarning.
data = data[data['horsepower'] != '?'].copy()
data['horsepower'] = data['horsepower'].astype('float64')

In [60]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
train, test = train_test_split(data, test_size=0.2, random_state=9)

In [61]:
# Separate the target (mpg) from the features by column name rather than by
# position, so the split stays correct even if the column order of the CSV
# changes. drop() returns new frames; train/test themselves are not modified.
X_train = train.drop('mpg', axis=1)
y_train = train['mpg']
X_test = test.drop('mpg', axis=1)
y_test = test['mpg']

In [62]:
# Schema check of both feature sets. info() prints its own report and returns
# None, so it is called directly instead of printing that None.
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 313 entries, 107 to 388
Data columns (total 8 columns):
cylinders       313 non-null int64
displacement    313 non-null float64
horsepower      313 non-null float64
weight          313 non-null int64
acceleration    313 non-null float64
model_year      313 non-null int64
origin          313 non-null int64
car_name        313 non-null object
dtypes: float64(3), int64(4), object(1)
memory usage: 22.0+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 79 entries, 290 to 54
Data columns (total 8 columns):
cylinders       79 non-null int64
displacement    79 non-null float64
horsepower      79 non-null float64
weight          79 non-null int64
acceleration    79 non-null float64
model_year      79 non-null int64
origin          79 non-null int64
car_name        79 non-null object
dtypes: float64(3), int64(4), object(1)
memory usage: 5.6+ KB
None


In [63]:
#categorical and numerical values
# Heuristic: a numeric column with fewer distinct values than sqrt(n_rows) is
# treated as categorical. Non-numeric columns are always categorical.
#
# Both lists are built in a single pass over the numeric columns instead of
# removing entries from the list being iterated: removing during iteration
# makes the loop skip the element that follows each removed one, so some
# columns were never tested against the threshold.
max_categorical_levels = np.sqrt(X_train.shape[0])
numeric_feature_columns = []
categorical_feature_columns = list(X_train.select_dtypes(exclude=[np.number]).columns)
for column in X_train.select_dtypes(include=[np.number]).columns:
    if len(X_train[column].unique()) < max_categorical_levels:
        categorical_feature_columns.append(column)
    else:
        numeric_feature_columns.append(column)

# 'origin' is always handled as categorical, whatever the heuristic decided.
# The guards keep this step safe to re-run and avoid a duplicate entry.
if 'origin' in numeric_feature_columns:
    numeric_feature_columns.remove('origin')
if 'origin' not in categorical_feature_columns:
    categorical_feature_columns.append('origin')

print(numeric_feature_columns)
print(categorical_feature_columns)

['displacement', 'horsepower', 'weight', 'acceleration']
['car_name', 'cylinders', 'model_year', 'origin']


In [65]:
# How often each car name occurs in the training features.
# Observations:
#   - one-hot encoding cannot be done for this column
#   - label encoding can be done
#   - almost all values are unique, so the column serves as an id and can be
#     dropped
X_train.loc[:, 'car_name'].value_counts()

peugeot504                        4
chevroletimpala                   4
toyotacorolla                     4
amcmatador                        4
amchornet                         3
fordpinto                         3
toyotacorona                      3
volkswagendasher                  3
amcgremlin                        3
fordmaverick                      3
chevroletcitation                 3
plymouthfuryiii                   3
chevroletvega                     3
pontiaccatalina                   3
chevroletchevette                 3
chevroletnova                     3
fordgrantorino                    3
dodgecolt                         2
fiat128                           2
fordltd                           2
saab99le                          2
chevroletmalibu                   2
oldsmobilecutlasssalonbrougham    2
plymouthreliant                   2
buickskylark                      2
audi100ls                         2
fordgrantorino(sw)                2
mazda626                    

In [66]:
# car_name acts as an identifier rather than a feature, so remove it from
# both feature sets before modelling.
X_train = X_train.drop(['car_name'], axis='columns')
X_test = X_test.drop(['car_name'], axis='columns')

In [67]:
# Baseline model: ordinary least squares on the unscaled features.
linR = LinearRegression().fit(X_train, y_train)
y_pred = linR.predict(X_test)
# Mean squared error on the held-out rows (shown as the cell output).
mean_squared_error(y_true=y_test, y_pred=y_pred)

11.027954778424236

In [68]:
# Ridge regression (L2 penalty, alpha=1000) on the same unscaled features.
rid = Ridge(alpha=1000).fit(X_train, y_train)
y_pred = rid.predict(X_test)
# Mean squared error on the held-out rows (shown as the cell output).
mean_squared_error(y_true=y_test, y_pred=y_pred)

9.661824694190706

In [69]:
#correlation
# Pairwise correlation of the numeric columns of the training split. train
# still contains the text column car_name, so the numeric columns are
# selected explicitly instead of relying on corr() to discard it.
train.select_dtypes(include=[np.number]).corr()
# all columns are correlated with target variable (mpg)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
mpg,1.0,-0.767988,-0.795293,-0.776623,-0.821549,0.445445,0.606431,0.590513
cylinders,-0.767988,1.0,0.951748,0.843675,0.895089,-0.523436,-0.355354,-0.580867
displacement,-0.795293,0.951748,1.0,0.893572,0.93463,-0.55618,-0.390274,-0.62325
horsepower,-0.776623,0.843675,0.893572,1.0,0.871373,-0.697825,-0.424324,-0.463201
weight,-0.821549,0.895089,0.93463,0.871373,1.0,-0.430969,-0.334786,-0.588595
acceleration,0.445445,-0.523436,-0.55618,-0.697825,-0.430969,1.0,0.291088,0.205201
model_year,0.606431,-0.355354,-0.390274,-0.424324,-0.334786,0.291088,1.0,0.253654
origin,0.590513,-0.580867,-0.62325,-0.463201,-0.588595,0.205201,0.253654,1.0


In [73]:
# Scaling values
# Each numeric feature is min-max scaled with its own scaler. The scaler is
# fitted on the training column only and then applied to both splits, so the
# test rows never influence the scaling range.
for val in numeric_feature_columns:
    sca = MinMaxScaler()
    # X[[val]] selects a one-column (2-D) frame, which is the shape the scaler
    # expects; this replaces the deprecated Series.reshape(-1, 1).
    sca.fit(X_train[[val]])
    # transform() returns an (n, 1) array; ravel() flattens it for assignment
    # back into a single column.
    X_train[val] = sca.transform(X_train[[val]]).ravel()
    X_test[val] = sca.transform(X_test[[val]]).ravel()

  after removing the cwd from sys.path.
  """
  


In [74]:
# Ridge regression again, this time on the min-max scaled features and with
# the same penalty (alpha=1000) as the unscaled run.
# Author's conclusion from the result: scaling is not required.
# NOTE(review): alpha was not re-tuned after scaling. An L2 penalty is
# sensitive to feature scale, so the comparison with the unscaled run may
# reflect the fixed alpha rather than the scaling itself -- confirm before
# relying on this conclusion.
rid = Ridge(alpha=1000).fit(X_train, y_train)
y_pred = rid.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)

25.687089119861362

Not much could be done with this dataset,still got a very low mean_Square_error of 9.66 using Ridge regul