In [1]:
import pandas as pd
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Company Dataset

In [2]:
dataframe = pd.read_csv("Company_Data.csv")

In [3]:
dataframe

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [4]:
labels=['bad','average','good']
bins=[0,5,10,17]
dataframe['Sales']=pd.cut(dataframe['Sales'],bins=bins,labels=labels)

In [5]:
dataframe

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,average,138,73,11,276,120,Bad,42,17,Yes,Yes
1,good,111,48,16,260,83,Good,65,10,Yes,Yes
2,good,113,35,10,269,80,Medium,59,12,Yes,Yes
3,average,117,100,4,466,97,Medium,55,14,Yes,Yes
4,bad,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,good,138,108,17,203,128,Good,33,14,Yes,Yes
396,average,139,23,3,37,120,Medium,55,11,No,Yes
397,average,162,26,12,368,159,Medium,40,18,Yes,Yes
398,average,100,79,7,284,95,Bad,50,12,Yes,Yes


In [6]:
dataframe["Sales"] = dataframe["Sales"].astype(object)

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

In [8]:
dataframe.dtypes

Sales          object
CompPrice       int64
Income          int64
Advertising     int64
Population      int64
Price           int64
ShelveLoc      object
Age             int64
Education       int64
Urban          object
US             object
dtype: object

In [9]:
string_columns = ["Sales","ShelveLoc","Urban","US"]   
for x in string_columns:
    dataframe[x] = dataframe[x].astype(str)
    
for i in string_columns:
    number = preprocessing.LabelEncoder()
    dataframe[i] = number.fit_transform(dataframe[i])

In [10]:
dataframe

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,0,138,73,11,276,120,0,42,17,1,1
1,2,111,48,16,260,83,1,65,10,1,1
2,2,113,35,10,269,80,2,59,12,1,1
3,0,117,100,4,466,97,2,55,14,1,1
4,1,141,64,3,340,128,0,38,13,1,0
...,...,...,...,...,...,...,...,...,...,...,...
395,2,138,108,17,203,128,1,33,14,1,1
396,0,139,23,3,37,120,2,55,11,0,1
397,0,162,26,12,368,159,2,40,18,1,1
398,0,100,79,7,284,95,0,50,12,1,1


In [11]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
array = dataframe.values
X = array[:,1:]
Y = array[:,0]
num_trees = 200
max_features = 7
kfold = KFold(n_splits=30, random_state=7)
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())



0.7197802197802201


In [12]:
model.fit(X,Y)

RandomForestClassifier(max_features=7, n_estimators=200)

In [13]:
y_pred = model.predict(X)

In [14]:
y_pred_df= pd.DataFrame({'actual': Y,
                         'predicted': model.predict(X)})

In [15]:
y_pred_df

Unnamed: 0,actual,predicted
0,0,0
1,2,2
2,2,2
3,0,0
4,1,1
...,...,...
395,2,2
396,0,0
397,0,0
398,0,0


In [16]:
from sklearn.metrics import classification_report
print(classification_report(Y,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       245
           1       1.00      1.00      1.00        76
           2       1.00      1.00      1.00        78
           3       1.00      1.00      1.00         1

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400



### Fraud Check Dataset

In [17]:
dataframe1 = pd.read_csv("Fraud_check.csv")

In [18]:
dataframe1

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [19]:
Data = dataframe1.rename({'Marital.Status': 'Marital_Status','Taxable.Income':'Taxable_Income',
                          'City.Population':'City_Population','Work.Experience':'Work_Experience'}, axis=1)

In [20]:
Data

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [21]:
Data.describe()

Unnamed: 0,Taxable_Income,City_Population,Work_Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


In [22]:
labels=['Good','Risky']
bins=[10003,50000,99620]
Data['Taxable_Income']=pd.cut(Data['Taxable_Income'],bins=bins,labels=labels)

In [23]:
Data["Taxable_Income"] = Data["Taxable_Income"].astype(object)

In [24]:
Data.dtypes

Undergrad          object
Marital_Status     object
Taxable_Income     object
City_Population     int64
Work_Experience     int64
Urban              object
dtype: object

In [25]:
Data.head()

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,NO,Single,Risky,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Risky,193264,15,YES
4,NO,Married,Risky,27533,28,NO


In [26]:
Data.dtypes

Undergrad          object
Marital_Status     object
Taxable_Income     object
City_Population     int64
Work_Experience     int64
Urban              object
dtype: object

In [27]:
string_columns = ["Taxable_Income","Urban","Marital_Status","Undergrad"]  

for x in string_columns:
    Data[x] = Data[x].astype(str)
    
for i in string_columns:
    number = preprocessing.LabelEncoder()
    Data[i] = number.fit_transform(Data[i])

In [28]:
Data

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,0,2,1,50047,10,1
1,1,0,0,134075,18,1
2,0,1,0,160205,30,1
3,1,2,1,193264,15,1
4,0,1,1,27533,28,0
...,...,...,...,...,...,...
595,1,0,1,39492,7,1
596,1,0,1,55369,2,1
597,0,0,0,154058,0,1
598,1,1,1,180083,17,0


In [29]:
x1=Data[['Undergrad','Marital_Status','City_Population','Work_Experience','Urban']]
y1=Data['Taxable_Income']

In [30]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

num_trees =200
max_features = 3
kfold = KFold(n_splits=50, random_state=7)
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, x1, y1, cv=kfold)
print(results.mean())



0.505


In [31]:
model.fit(x1,y1)

RandomForestClassifier(max_features=3, n_estimators=200)

In [32]:
y_pred1 = model.predict(x1)

In [33]:
y_pred_df1= pd.DataFrame({'actual': y1,
                         'predicted': model.predict(x1)})

In [34]:
y_pred_df1

Unnamed: 0,actual,predicted
0,1,1
1,0,0
2,0,0
3,1,1
4,1,1
...,...,...
595,1,1
596,1,1
597,0,0
598,1,1


In [35]:
print(classification_report(y1,y_pred1))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       269
           1       1.00      1.00      1.00       330
           2       1.00      1.00      1.00         1

    accuracy                           1.00       600
   macro avg       1.00      1.00      1.00       600
weighted avg       1.00      1.00      1.00       600

