# (1) Company Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("D:\Assignment\Random Forests\Company_Data.csv")

In [3]:
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.4,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No


In [4]:
df.describe()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,Age,Education
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,7.496325,124.975,68.6575,6.635,264.84,115.795,53.3225,13.9
std,2.824115,15.334512,27.986037,6.650364,147.376436,23.676664,16.200297,2.620528
min,0.0,77.0,21.0,0.0,10.0,24.0,25.0,10.0
25%,5.39,115.0,42.75,0.0,139.0,100.0,39.75,12.0
50%,7.49,125.0,69.0,5.0,272.0,117.0,54.5,14.0
75%,9.32,135.0,91.0,12.0,398.5,131.0,66.0,16.0
max,16.27,175.0,120.0,29.0,509.0,191.0,80.0,18.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    object 
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    object 
 10  US           400 non-null    object 
dtypes: float64(1), int64(7), object(3)
memory usage: 34.5+ KB


In [6]:
df.isna().sum()

Sales          0
CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
dtype: int64

In [7]:
df.duplicated().sum()

0

In [8]:
df = pd.get_dummies(df, columns=['Urban', 'US'], drop_first=True)

In [9]:
df['ShelveLoc'] = df['ShelveLoc'].map({'Good':1,'Medium':2,'Bad':3})

In [10]:
df.rename(columns= {'Urban_Yes':'Urban', 'US_Yes':'US'},inplace=True)
df.head()

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.5,138,73,11,276,120,3,42,17,1,1
1,11.22,111,48,16,260,83,1,65,10,1,1
2,10.06,113,35,10,269,80,2,59,12,1,1
3,7.4,117,100,4,466,97,2,55,14,1,1
4,4.15,141,64,3,340,128,3,38,13,1,0


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Sales        400 non-null    float64
 1   CompPrice    400 non-null    int64  
 2   Income       400 non-null    int64  
 3   Advertising  400 non-null    int64  
 4   Population   400 non-null    int64  
 5   Price        400 non-null    int64  
 6   ShelveLoc    400 non-null    int64  
 7   Age          400 non-null    int64  
 8   Education    400 non-null    int64  
 9   Urban        400 non-null    uint8  
 10  US           400 non-null    uint8  
dtypes: float64(1), int64(8), uint8(2)
memory usage: 29.0 KB


In [12]:
x = df.iloc[:,[i for i in range(0,6)] + [i for i in range(7,11)]]
y = df['ShelveLoc']

In [13]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.25, random_state=1)

In [14]:
# Creating function for different model

def Algo(model):
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    print(classification_report(ytest, ypred))
    print('Training Data Accuracy', model.score(xtrain, ytrain))
    print('Test Data Accuracy', model.score(xtest, ytest))

### Random Forest 

In [15]:
Algo(RandomForestClassifier(n_estimators=15,max_depth=2))

              precision    recall  f1-score   support

           1       0.56      0.29      0.38        17
           2       0.62      0.95      0.75        59
           3       0.00      0.00      0.00        24

    accuracy                           0.61       100
   macro avg       0.39      0.41      0.38       100
weighted avg       0.46      0.61      0.51       100

Training Data Accuracy 0.6333333333333333
Test Data Accuracy 0.61


In [16]:
Algo(BaggingClassifier(DecisionTreeClassifier(max_depth=2),n_estimators=1))

              precision    recall  f1-score   support

           1       0.60      0.18      0.27        17
           2       0.60      0.76      0.67        59
           3       0.40      0.33      0.36        24

    accuracy                           0.56       100
   macro avg       0.53      0.42      0.44       100
weighted avg       0.55      0.56      0.53       100

Training Data Accuracy 0.64
Test Data Accuracy 0.56


In [17]:
rf = RandomForestClassifier()
rf.fit(xtrain,ytrain)
ypred = rf.predict(xtest)

kf = KFold(n_splits=10)
score = cross_val_score(RandomForestClassifier(),x,y,cv=kf)
score.mean()

0.645

***Other Ensemble Techniques***

In [18]:
Algo(DecisionTreeClassifier(max_depth=2))

              precision    recall  f1-score   support

           1       0.64      0.41      0.50        17
           2       0.62      0.68      0.65        59
           3       0.36      0.38      0.37        24

    accuracy                           0.56       100
   macro avg       0.54      0.49      0.51       100
weighted avg       0.56      0.56      0.56       100

Training Data Accuracy 0.6666666666666666
Test Data Accuracy 0.56


In [19]:
Algo(DecisionTreeClassifier(criterion='entropy', max_depth=2))

              precision    recall  f1-score   support

           1       0.64      0.41      0.50        17
           2       0.64      0.88      0.74        59
           3       0.50      0.17      0.25        24

    accuracy                           0.63       100
   macro avg       0.59      0.49      0.50       100
weighted avg       0.61      0.63      0.58       100

Training Data Accuracy 0.6566666666666666
Test Data Accuracy 0.63


In [20]:
Algo(AdaBoostClassifier(n_estimators=10))

              precision    recall  f1-score   support

           1       0.75      0.35      0.48        17
           2       0.64      0.95      0.77        59
           3       0.80      0.17      0.28        24

    accuracy                           0.66       100
   macro avg       0.73      0.49      0.51       100
weighted avg       0.70      0.66      0.60       100

Training Data Accuracy 0.66
Test Data Accuracy 0.66


In [21]:
Algo(GradientBoostingClassifier(learning_rate=0.034,n_estimators=9))

              precision    recall  f1-score   support

           1       1.00      0.12      0.21        17
           2       0.61      1.00      0.76        59
           3       1.00      0.04      0.08        24

    accuracy                           0.62       100
   macro avg       0.87      0.39      0.35       100
weighted avg       0.77      0.62      0.50       100

Training Data Accuracy 0.6266666666666667
Test Data Accuracy 0.62


In [22]:
models = []

models.append(('lr',LogisticRegression()))
models.append(('dt', DecisionTreeClassifier()))
models.append(('dt1', DecisionTreeClassifier(criterion='entropy')))
models.append(('knn',KNeighborsClassifier()))
models.append(('rf',RandomForestClassifier()))

In [23]:
Algo(StackingClassifier(estimators=models, final_estimator= DecisionTreeClassifier()))

              precision    recall  f1-score   support

           1       0.55      0.71      0.62        17
           2       0.71      0.59      0.65        59
           3       0.41      0.50      0.45        24

    accuracy                           0.59       100
   macro avg       0.56      0.60      0.57       100
weighted avg       0.61      0.59      0.60       100

Training Data Accuracy 0.54
Test Data Accuracy 0.59


In [24]:
Algo(VotingClassifier(estimators= models))

              precision    recall  f1-score   support

           1       0.57      0.47      0.52        17
           2       0.68      0.85      0.76        59
           3       0.62      0.33      0.43        24

    accuracy                           0.66       100
   macro avg       0.62      0.55      0.57       100
weighted avg       0.65      0.66      0.64       100

Training Data Accuracy 1.0
Test Data Accuracy 0.66


---------------------------------------------------------------------------------------------------------------------------

# (2) Fraud Check 

In [25]:
df = pd.read_csv("D:\Assignment\Random Forests\Fraud_check.csv")

In [26]:
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [28]:
df.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [29]:
df.duplicated().sum()

0

In [30]:
df = pd.get_dummies(df, columns= ['Undergrad', 'Marital.Status', 'Urban'],drop_first= True)

In [31]:
df.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
count,600.0,600.0,600.0,600.0,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333,0.52,0.323333,0.361667,0.503333
std,26204.827597,49850.075134,8.842147,0.500017,0.468139,0.480884,0.500406
min,10003.0,25779.0,0.0,0.0,0.0,0.0,0.0
25%,32871.5,66966.75,8.0,0.0,0.0,0.0,0.0
50%,55074.5,106493.5,15.0,1.0,0.0,0.0,1.0
75%,78611.75,150114.25,24.0,1.0,1.0,1.0,1.0
max,99619.0,199778.0,30.0,1.0,1.0,1.0,1.0


In [32]:
#Creating new cols TaxInc and dividing 'Taxable.Income' cols on the basis of Taxable.Income <= 30000 for Risky and Good
conditions = [
    (df['Taxable.Income'] <= 30000),
    (df['Taxable.Income'] > 30000)
    ]

# create a list of the values we want to assign for each condition
values = ['Risky', 'Good']

# create a new column and use np.select to assign values to it using our lists as arguments
df['TaxInc'] = np.select(conditions, values)

# display updated DataFrame
df

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,TaxInc
0,68833,50047,10,0,0,1,1,Good
1,33700,134075,18,1,0,0,1,Good
2,36925,160205,30,0,1,0,1,Good
3,50190,193264,15,1,0,1,1,Good
4,81002,27533,28,0,1,0,0,Good
...,...,...,...,...,...,...,...,...
595,76340,39492,7,1,0,0,1,Good
596,69967,55369,2,1,0,0,1,Good
597,47334,154058,0,0,0,0,1,Good
598,98592,180083,17,1,1,0,0,Good


***Lets assume: taxable_income <= 30000 as “Risky = 1” and others are “Good = 0”***

In [33]:
df = pd.get_dummies(df,columns = ["TaxInc"],drop_first=True)

In [34]:
df.head()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES,TaxInc_Risky
0,68833,50047,10,0,0,1,1,0
1,33700,134075,18,1,0,0,1,0
2,36925,160205,30,0,1,0,1,0
3,50190,193264,15,1,0,1,1,0
4,81002,27533,28,0,1,0,0,0


In [35]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [36]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=3)

In [37]:
# Creating function for different model

def Algo(model):
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    print(classification_report(ytest, ypred))
    print('Training Data Accuracy', model.score(xtrain, ytrain))
    print('Test Data Accuracy', model.score(xtest, ytest))

### Random Forest 

In [38]:
Algo(RandomForestClassifier())

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [39]:
Algo(BaggingClassifier(DecisionTreeClassifier()))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [40]:
rf = RandomForestClassifier()
rf.fit(xtrain,ytrain)
ypred = rf.predict(xtest)

kf = KFold(n_splits=10)
score = cross_val_score(RandomForestClassifier(),x,y,cv=kf)
score.mean()

0.9983333333333334

***Other Ensemble Techniques***

In [41]:
Algo(DecisionTreeClassifier())

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [42]:
Algo(DecisionTreeClassifier(criterion='entropy'))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [43]:
Algo(AdaBoostClassifier())

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [44]:
Algo(GradientBoostingClassifier())

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [45]:
models = []

models.append(('lr',LogisticRegression()))
models.append(('dt', DecisionTreeClassifier()))
models.append(('dt1', DecisionTreeClassifier(criterion='entropy')))
models.append(('knn',KNeighborsClassifier()))
models.append(('rf',RandomForestClassifier()))

In [46]:
Algo(StackingClassifier(estimators=models, final_estimator= RandomForestClassifier()))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0


In [47]:
Algo(VotingClassifier(estimators= models))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        91
           1       1.00      1.00      1.00        29

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120

Training Data Accuracy 1.0
Test Data Accuracy 1.0
