In [None]:
# Required Libraries

import pandas as pd
import re
import nltk 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


# Dataset
  1. Dataset is used from online retail data of a gift shop.
  2. Attributes are InvoiceNo, StockCode, Description, Quantity, InvoiceDate UnitPrice, CustomerID, Country.



In [None]:
data = pd.read_csv('/content/drive/My Drive/Academic projects/Supply Chain Analytics/OnlineRetail.csv',  encoding= 'unicode_escape')
data.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [None]:
# checking actual and unique in each attibutes.

for i in data.columns:
  print("Actual number of values",i,len(data[i]))
  print("Unique number of values",i,len(data[i].unique()))

Actual number of values InvoiceNo 541909
Unique number of values InvoiceNo 25900
Actual number of values StockCode 541909
Unique number of values StockCode 4070
Actual number of values Description 541909
Unique number of values Description 4224
Actual number of values Quantity 541909
Unique number of values Quantity 722
Actual number of values InvoiceDate 541909
Unique number of values InvoiceDate 23260
Actual number of values UnitPrice 541909
Unique number of values UnitPrice 1630
Actual number of values CustomerID 541909
Unique number of values CustomerID 4373
Actual number of values Country 541909
Unique number of values Country 38


In [None]:
# Checking null values in each attributes

data.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [None]:
# Dropped null values because imputation in not possible in Description column and Customer ID

data.dropna(inplace=True)
data.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

# Lets create new fetures from "Description" Column.

- POS tagging of Description column for taking out features as a category.

- Fetched 'noun' as a value and made it a new column name 'Product Type'

- Also take out colours from 'Description' and made a new column name "Colour_type" .

In [None]:
colours = ['red','orange', 'yellow','green', 'blue', 'indigo','violet','purple','pink','silver', 'gold', 'beige', 'brown', 'grey', 'gray', 'black', 'white', 'cream']

stop_words = set(stopwords.words('english'))
Product_type = []
Colour_type = []
dataset=data.head(50000)
for row in dataset.iloc[:,2]:
  s=" "
  description = re.sub('[^a-zA-Z]'," ", str(row).lower()) #cleaning of text data
  wordsList = nltk.word_tokenize(description) #tokenization
  wordsList = [nltk.stem.WordNetLemmatizer().lemmatize(w, 'n') for w in wordsList if not w in stop_words] # lemmitization

  flag=False
  for w in wordsList:
    if w in colours:
      Colour_type.append(w)
      flag=True
      break
  if flag==False:
    Colour_type.append("no_color") #taking out colours from description

  tagged = nltk.pos_tag(wordsList)

  for tag in tagged:
    if tag[1]=='NN' :
      s+=tag[0] +  " "
  Product_type.append(s)

print(Colour_type)
print(Product_type)

['white', 'white', 'cream', 'no_color', 'red', 'no_color', 'no_color', 'no_color', 'red', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'red', 'yellow', 'blue', 'no_color', 'pink', 'red', 'green', 'no_color', 'no_color', 'no_color', 'no_color', 'red', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'red', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'no_color', 'red', 'no_color', 'white', 'white', 'cream', 'red', 'no_color', 'no_color', 'no_color', 'no_color', 'white', 'white', 'white', 'white', 'no_color', 'red', 'no_color', 'no_color', 'no_color', 'white', 'white', 'cream', 'red', 'no_color', 'no_color', 'no_color', 'no_color', 'white', 'white', 'white', 'white', 'no_color', 'red', 'no_color', 'no_color', 'no_color', 'red', 'red', 'no_color', 'pink', 'black', 'no_color', 'no_color', 'red', 'blue', 'red', 'red', 'no_color', 'no_color', 'n

In [None]:
# Inserting new columns in dataframe 

dataset['Product Type']=Product_type
dataset['Colour_type']=Colour_type
dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Product Type,Colour_type
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom,heart light holder,white
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,metal lantern,white
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom,cream heart coat hanger,cream
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,union flag water bottle,no_color
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom,heart,red


In [None]:
# Dropped "Description"and "InvoiceDate" columns

X = dataset.drop(["Description","InvoiceDate"],axis=1)

In [None]:
X.info()

# Created a new feature name "Revenue"

  - revenue = UnitPrice * Quantity

In [None]:
X['Revenue'] = X['UnitPrice'] * X['Quantity']
X

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Country,Product Type,Colour_type,Revenue
0,536365,85123A,6,2.55,17850.0,United Kingdom,heart light holder,white,15.30
1,536365,71053,6,3.39,17850.0,United Kingdom,metal lantern,white,20.34
2,536365,84406B,8,2.75,17850.0,United Kingdom,cream heart coat hanger,cream,22.00
3,536365,84029G,6,3.39,17850.0,United Kingdom,union flag water bottle,no_color,20.34
4,536365,84029E,6,3.39,17850.0,United Kingdom,heart,red,20.34
...,...,...,...,...,...,...,...,...,...
79173,542914,20725,5,1.65,16161.0,United Kingdom,lunch bag retrospot,red,8.25
79174,542914,22367,1,1.95,16161.0,United Kingdom,design,no_color,1.95
79175,542914,22634,1,9.95,16161.0,United Kingdom,child breakfast spaceboy,no_color,9.95
79176,542914,21257,1,7.95,16161.0,United Kingdom,box medium,no_color,7.95


In [None]:
# Label encoding of categorical features

from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

for col in ["InvoiceNo", "StockCode", "CustomerID","Country", "Product Type","Colour_type"]:
  X[col] = label_encoder.fit_transform(X[col])

In [None]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 79177
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   InvoiceNo     50000 non-null  int64  
 1   StockCode     50000 non-null  int64  
 2   Quantity      50000 non-null  int64  
 3   UnitPrice     50000 non-null  float64
 4   CustomerID    50000 non-null  int64  
 5   Country       50000 non-null  int64  
 6   Product Type  50000 non-null  int64  
 7   Colour_type   50000 non-null  int64  
 8   Revenue       50000 non-null  float64
dtypes: float64(2), int64(7)
memory usage: 3.8 MB


In [None]:
!pip install kmodes 



In [None]:
# Changed the data type of attributes

X = X.astype('category')
X.iloc[:, 2] = X.iloc[:, 2].astype(float)
X.iloc[:, 3] = X.iloc[:, 3].astype(float)
X.iloc[:, 8] = X.iloc[:, 3].astype(float)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50000 entries, 0 to 79177
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   InvoiceNo     50000 non-null  category
 1   StockCode     50000 non-null  category
 2   Quantity      50000 non-null  float64 
 3   UnitPrice     50000 non-null  float64 
 4   CustomerID    50000 non-null  category
 5   Country       50000 non-null  category
 6   Product Type  50000 non-null  category
 7   Colour_type   50000 non-null  category
 8   Revenue       50000 non-null  float64 
dtypes: category(6), float64(3)
memory usage: 2.3 MB


In [None]:
# Train test split of new dataframe X

from sklearn.model_selection import train_test_split
train, test = train_test_split(X, train_size=0.8, random_state = 0)

# Cluster the similar items for new attribute cluster number, for this K-prototype clustering is used.

In [None]:
# Checking the optimal values of 'K'

import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes

cost = []
for num_clusters in list(range(2,15)):
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao')
    kproto.fit_predict(train, categorical=[0,1,4,5,6,7])
    cost.append(kproto.cost_)
    labels=kproto.labels_
plt.plot(cost)

In [None]:
# New attribute cluster number is generated

kproto = KPrototypes(n_clusters=3, init='Cao')
kproto.fit_predict(train, categorical=[0,1,4,5,6])
print(kproto.cost_)
labels=kproto.labels_

In [None]:
# Adding new attribute

train["Cluster number"]=labels
train

In [None]:
dataset

In [None]:
train.to_csv('/content/drive/My Drive/Academic projects/Supply Chain Analytics/train_clust_num.csv')

In [None]:
# Now added "InvoiceDate" in the dataframe

mergedDf = train.merge(pd.DataFrame(dataset["InvoiceDate"]), left_index=True, right_index=True)
mergedDf

# Feature engineering of "InvoiceDate" column

In [None]:
from datetime import datetime

year = []
month = []
day = []
dayofweek = []
for row in mergedDf["InvoiceDate"]:
  dt=datetime.strptime(row, '%m/%d/%Y %H:%M')
  year.append(dt.year)
  month.append(dt.month)
  day.append(dt.day)
  dayofweek.append(dt.strftime("%w"))

mergedDf['Year']=year
mergedDf["Month"]=month
mergedDf["Day"]=day
mergedDf['DayOfWeek']=dayofweek
mergedDf.drop(['InvoiceDate'],axis=1, inplace=True)
mergedDf.head()

In [None]:
mergedDf.to_csv('/content/drive/My Drive/Academic projects/Supply Chain Analytics/merfedDf.csv')

In [None]:
mergedDf = pd.read_csv('/content/drive/My Drive/Academic projects/Supply Chain Analytics/merfedDf.csv')
mergedDf.drop(['Unnamed: 0'], axis =1, inplace=True)
mergedDf

# Classification of test data into number of clusters

- Cluster numbers were treated as a target variable as the objective
was to match the records from the validation and testing sets with the clusters from the training
set.


In [None]:
# Splitting of merdedDF dataframe into traina nd validation

from sklearn.model_selection import train_test_split
train_, val_= train_test_split(mergedDf, train_size = 0.8, random_state = 0)

In [None]:
train_y=train_["Cluster number"]
train_x=train_.drop(['Cluster number'],axis=1,inplace=False)

val_y=val_["Cluster number"]
val_x=val_.drop(['Cluster number'],axis=1,inplace=False)

For classification
  - Linear SVC is used. and it is giving best result among other machine leaning algorithms.

In [None]:
from sklearn.svm import LinearSVC 

In [1]:
model1 = LinearSVC()
model1.fit(train_x,train_y)

In [None]:
# validating on test data

pred_y = model1.predict(val_x)

In [2]:
# Preformance evaluation

from sklearn.metrics import accuracy_score
accuracy_score(val_y,pred_y)

In [None]:
# Adding "InvoiceDate" in test data

test_Df = test.merge(pd.DataFrame(dataset["InvoiceDate"]), left_index=True, right_index=True)
test_Df

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Country,Product Type,Colour_type,Revenue,InvoiceDate
17873,617,729,1.0,1.25,1158,25,2158,7,1.25,12/8/2010 12:46
29644,985,1181,10.0,1.95,225,25,71,7,1.95,12/14/2010 12:22
73585,2238,1608,1.0,7.95,551,25,1793,5,7.95,1/27/2011 12:40
39495,1339,197,1.0,2.55,726,25,917,7,2.55,12/20/2010 16:08
69859,2106,121,2.0,3.25,409,25,1763,11,3.25,1/24/2011 16:05
...,...,...,...,...,...,...,...,...,...,...
38320,1300,1444,3.0,1.25,1099,25,2213,1,1.25,12/20/2010 11:08
44202,1447,2035,2.0,2.10,692,25,1254,7,2.10,1/5/2011 11:41
75671,2317,216,4.0,1.95,216,25,197,7,1.95,1/30/2011 13:48
58928,1855,1031,3.0,2.10,1225,25,1084,13,2.10,1/16/2011 16:23


In [None]:
# Feature engineering of "InvoiceDate" column

from datetime import datetime

year = []
month = []
day = []
dayofweek = []
for row in test_Df["InvoiceDate"]:
  dt=datetime.strptime(row, '%m/%d/%Y %H:%M')
  year.append(dt.year)
  month.append(dt.month)
  day.append(dt.day)
  dayofweek.append(dt.strftime("%w"))

test_Df['Year']=year
test_Df["Month"]=month
test_Df["Day"]=day
test_Df['DayOfWeek']=dayofweek
test_Df.drop(['InvoiceDate'],axis=1, inplace=True)

test_Df=test_Df.reset_index()
test_Df.drop(["index"],axis=1, inplace = True)
test_Df.head()

test_Df.head()

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Country,Product Type,Colour_type,Revenue,Year,Month,Day,DayOfWeek
17873,617,729,1.0,1.25,1158,25,2158,7,1.25,2010,12,8,3
29644,985,1181,10.0,1.95,225,25,71,7,1.95,2010,12,14,2
73585,2238,1608,1.0,7.95,551,25,1793,5,7.95,2011,1,27,4
39495,1339,197,1.0,2.55,726,25,917,7,2.55,2010,12,20,1
69859,2106,121,2.0,3.25,409,25,1763,11,3.25,2011,1,24,1


In [None]:
# Prediction of cluster_number on test data

test_Df['Cluster number'] = model1.predict(test_Df)
test_Df 

Unnamed: 0,InvoiceNo,StockCode,Quantity,UnitPrice,CustomerID,Country,Product Type,Colour_type,Revenue,Year,Month,Day,DayOfWeek,Cluster number
0,617,729,1.0,1.25,1158,25,2158,7,1.25,2010,12,8,3,2
1,985,1181,10.0,1.95,225,25,71,7,1.95,2010,12,14,2,2
2,2238,1608,1.0,7.95,551,25,1793,5,7.95,2011,1,27,4,2
3,1339,197,1.0,2.55,726,25,917,7,2.55,2010,12,20,1,2
4,2106,121,2.0,3.25,409,25,1763,11,3.25,2011,1,24,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1300,1444,3.0,1.25,1099,25,2213,1,1.25,2010,12,20,1,2
9996,1447,2035,2.0,2.10,692,25,1254,7,2.10,2011,1,5,3,2
9997,2317,216,4.0,1.95,216,25,197,7,1.95,2011,1,30,0,2
9998,1855,1031,3.0,2.10,1225,25,1084,13,2.10,2011,1,16,0,2


# Prediction of "Quantity" column for the products demands

In [None]:
train_y=train_["Quantity"].astype('int')
train_x=train_.drop(['Quantity'], axis=1,inplace=False)

test_Df_y=test_Df["Quantity"].astype('int')
test_Df_x=test_Df.drop(['Quantity'],axis=1,inplace=False)

Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(bootstrap=True,ccp_alpha=0.0,
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                                                   n_jobs=None,)
clf.fit(train_x, train_y)

In [None]:
prediction_test = clf.predict(test_Df_x)
prediction_test

In [None]:
from sklearn.metrics import f1_score
f1_score(test_Df_y, prediction_test, average='micro')

In [1]:
accuracy_score(test_Df_y, prediction_test)

In [None]:
# Hyperparameter tuning of RAndom forest algorithm

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from pprint import pprint
import numpy as np


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)



# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(train_x, train_y)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 166.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 310.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [2]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_x, train_y)
knn=neigh.predict(test_Df_x)
print(accuracy_score(test_Df_y, knn))

In [3]:
# SVC with kernel 

from sklearn import svm
from sklearn.svm import SVC

rbf_svc = svm.SVC(kernel='rbf')
rbf_svc.fit(train_x, train_y)

rbf=rbf_svc.predict(test_Df_x)
accuracy_score(test_Df_y, rbf)


In [4]:
# AdaBoost

 from sklearn.ensemble import AdaBoostClassifier

ad = AdaBoostClassifier(n_estimators=100, random_state=0)
ad.fit(train_x, train_y)
adb=ad.predict(test_Df_x)
print(accuracy_score(test_Df_y, adb))

In [None]:
# logistic

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(multi_class='ovr')
lr.fit(train_x, train_y)
lrc=lr.predict(test_Df_x)
print(accuracy_score(test_Df_y, lrc))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.0248


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [5]:
# Naive base Classifier

from sklearn.naive_bayes import GaussianNB

lr = GaussianNB()
lr.fit(train_x, train_y)
lrc=lr.predict(test_Df_x)
print(accuracy_score(test_Df_y, lrc))

In [6]:
# Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier 

dtree_model = DecisionTreeClassifier().fit(train_x, train_y)
dtree_predictions = dtree_model.predict(test_Df_x)
accuracy_score(test_Df_y, dtree_predictions)

In [None]:
# GradientBoostingClassifier

from sklearn.ensemble import GradientBoostingClassifier

gb=GradientBoostingClassifier()
gb.fit(train_x, train_y)
gbc=lr.predict(test_Df_x)
print(accuracy_score(test_Df_y, gbc))