## Import Necessary Modules 

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd

#import the necessary modelling algos.

#classification.
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
 

#regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor


#model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,LabelEncoder
# `Imputer` was removed from sklearn.preprocessing in scikit-learn 0.22. Fall back to
# SimpleImputer under the same name so this cell still imports on current releases.
# NOTE(review): SimpleImputer has no `axis` argument — check any later use of Imputer.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    from sklearn.impute import SimpleImputer as Imputer

#evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification

# Loading the Data File  

In [2]:
# Read the tab-separated records from data.csv into a DataFrame and preview the first rows.
data = pd.read_csv("data.csv", delimiter="\t")
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
1,1000002,P00285442,M,25-35,16,C,5,0,8,15.0,10.0,7969
2,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,11.0,15227
3,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
4,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,10.0,15854


In [3]:
# Number of rows in the loaded frame.
data.shape[0]

49

### Checking for Null Values 

In [4]:
# Per-column count of missing entries before any imputation.
data.isna().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            1
Product_Category_3            1
Purchase                      0
dtype: int64

### Replacing Null values with Mean

In [5]:
# Fill the missing entries in the two secondary category columns with each column's own mean.
# NOTE(review): these columns look like category codes; a mean may not be a valid code —
# confirm that mean imputation is intended rather than a mode or sentinel value.
for category_col in ("Product_Category_2", "Product_Category_3"):
    data[category_col] = data[category_col].fillna(data[category_col].mean())


In [6]:
# Re-check the per-column missing counts after imputation.
data.isna().sum()

User_ID                       0
Product_ID                    0
Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
Product_Category_2            0
Product_Category_3            0
Purchase                      0
dtype: int64

In [7]:
# Count column labels that repeat an earlier label.
duplicate_label_flags = data.columns.duplicated()
duplicate_label_flags.sum()

0

### Finding the Number of Unique Values in Each Column

In [8]:
# Distinct-value count for every column (axis 0 counts down the rows of each column).
data.nunique(axis=0)

User_ID                       12
Product_ID                    49
Gender                         2
Age                            6
Occupation                     9
City_Category                  3
Stay_In_Current_City_Years     9
Marital_Status                 2
Product_Category_1             8
Product_Category_2            19
Product_Category_3            18
Purchase                      49
dtype: int64

In [9]:
# Preview the first five values of the regression target.
data['Purchase'].head(n=5)

0    15200
1     7969
2    15227
3    19215
4    15854
Name: Purchase, dtype: int64

### Splitting Data into Train and Test data sets 

In [10]:
# Separate the target from the predictors, then one-hot encode the non-numeric predictors.
# NOTE(review): Product_ID is unique on every row of this sample (49 of 49), so its dummy
# columns presumably add no generalizable signal — consider dropping it before encoding.
y = data['Purchase']
x = pd.get_dummies(data.drop(columns=['Purchase']))
print('Total Features: {}'.format(x.shape[1]))

Total Features: 67


In [11]:
# Hold out 30% of the rows for evaluation; random_state is fixed so the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=0,
)

# Show the shape of each of the four pieces.
print(*(part.shape for part in (train_x, test_x, train_y, test_y)))

(34, 67) (15, 67) (34,) (15,)


### Applying Machine Learning Algorithms

In [12]:
# Candidate regressors paired with their display labels. The two lists derived below stay
# index-aligned because both come from this single sequence.
# NOTE(review): min_samples_leaf=100 is larger than the 34 training rows reported by the
# split, so the decision tree presumably cannot split at all — confirm the intended value.
_labelled_regressors = [
    ('Linear Regression', LinearRegression()),
    ('KNN', KNeighborsRegressor(n_neighbors=20)),
    ('Support Vector Regression', SVR(kernel='rbf')),
    ('Decision Tree Regression', DecisionTreeRegressor(max_depth=15, min_samples_leaf=100)),
]

Regressors = [estimator for _, estimator in _labelled_regressors]

Regressor_names = [label for label, _ in _labelled_regressors]

In [13]:
# Fit each candidate on the training split and record its root-mean-squared error on the
# held-out split, in the same order as Regressor_names.
rmse = []

for rgr in Regressors:
    rgr.fit(train_x, train_y)
    y_pred = rgr.predict(test_x)
    rmse.append(np.sqrt(mean_squared_error(test_y, y_pred)))

# Column-oriented summary: one label list and one score list of equal length.
d = {
    'Modelling Algorithm': Regressor_names,
    'RMSE': rmse,
}
d

{'Modelling Algorithm': ['Linear Regression',
  'KNN',
  'Support Vector Regression',
  'Decision Tree Regression'],
 'RMSE': [3074.9600605747405,
  4004.4047617592305,
  4093.376645017606,
  4181.145779173607]}

In [14]:
# Tabulate the label/score summary as a DataFrame.
result = pd.DataFrame(data=d)

In [15]:
# Display the per-model RMSE comparison table.
result

Unnamed: 0,Modelling Algorithm,RMSE
0,Linear Regression,3074.960061
1,KNN,4004.404762
2,Support Vector Regression,4093.376645
3,Decision Tree Regression,4181.145779
