In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import tree

# Survived Factor Project

## Random Forest

In [3]:
from sklearn.ensemble import RandomForestClassifier

In [4]:
le = preprocessing.LabelEncoder()

In [5]:
df = pd.read_csv("train.csv")

In [6]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
df['Sex'] = le.fit_transform(df.Sex)

In [8]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [9]:
features = ["Age","Sex","Fare","Pclass","SibSp"]

In [10]:
rf_model.fit(X=df[features], y=df.Survived)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
print("oob accuracy")
print(rf_model.oob_score_)

oob accuracy
0.8143982002249719


In [12]:
# now getting imp features

In [13]:
for feature,imp in zip(features,rf_model.feature_importances_):
    print(feature,imp)

Age 0.2735410885826881
Sex 0.27499455826326086
Fare 0.3100738753883132
Pclass 0.08986700289127016
SibSp 0.051523474874467734



## Now We got important variables as Age, Sex and fare

In [14]:
tree_model = tree.DecisionTreeClassifier()

In [15]:
feature = ["Age","Sex","Fare"]

In [16]:
tree_model.fit(X=df[feature], y=df["Survived"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

## For Visualization

In [17]:
with open("survived.dot","w") as f:
    f= tree.export_graphviz(tree_model, feature_names=["Age","Sex","Fare"], out_file=f);

## Prediction

In [18]:
df1= pd.read_csv("test.csv")

In [19]:
df1["Sex"] = le.fit_transform(df1.Sex)

In [20]:
test_features = pd.DataFrame([df1["Age"], df1["Sex"], df1["Fare"]]).T

In [21]:
test_prediction = tree_model.predict(X=test_features)

In [22]:
preds = pd.DataFrame({"Passenger ID":df1['PassengerId'], "Survived":test_prediction})

In [23]:
preds.to_csv("suroutput.csv", index=False)

# Attrition analysis

In [92]:
data = pd.read_csv("general_data.csv")

In [93]:
data.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4


In [94]:
from sklearn.model_selection import train_test_split

In [95]:
data["Attrition"] = le.fit_transform(data.Attrition)
data["BusinessTravel"] = le.fit_transform(data.BusinessTravel)
data["Department"] = le.fit_transform(data.Department)
data["EducationField"] = le.fit_transform(data.EducationField)
data["Gender"] = le.fit_transform(data.Gender)
data["Over18"] = le.fit_transform(data.Over18)
data["JobRole"]= le.fit_transform(data.JobRole)
data["MaritalStatus"]= le.fit_transform(data.MaritalStatus)

In [96]:
data = data.fillna(method='bfill')

data = data.fillna(method='ffill')

In [97]:
data.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [98]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [99]:
data = data.drop(['Over18','EmployeeCount', 'EmployeeID','StandardHours'],axis=1)

In [100]:

data = data.dropna()
data1 = data.drop_duplicates()

In [101]:
rf = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [102]:
afeatures = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField','Gender','JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked','PercentSalaryHike','StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [109]:
rf_model.fit(X=data[afeatures], y=data["Attrition"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [106]:
print(rf_model.oob_score_)

0.9997732426303855


In [107]:
for feature,imp in zip(afeatures, rf_model.feature_importances_):
    print(feature,"\t",imp);

Age 	 0.09665924370739223
BusinessTravel 	 0.02793614721604086
Department 	 0.026369101935471227
DistanceFromHome 	 0.0697079304112162
Education 	 0.04050388121413213
EducationField 	 0.04158723268796424
Gender 	 0.018997178079289977
JobLevel 	 0.037206825517244135
JobRole 	 0.05533737842164919
MaritalStatus 	 0.03994192853142236
MonthlyIncome 	 0.09358999667202055
NumCompaniesWorked 	 0.0560120435293305
PercentSalaryHike 	 0.06553028015936062
StockOptionLevel 	 0.03422595161954537
TotalWorkingYears 	 0.08653688185128805
TrainingTimesLastYear 	 0.045442507153101226
YearsAtCompany 	 0.06800017230357239
YearsSinceLastPromotion 	 0.04281779257408914
YearsWithCurrManager 	 0.053597526415869574


## Age, Monthly Income, TotalWorkingYears

In [116]:
att_model = tree.DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)

In [117]:
feat = pd.DataFrame([data['Age'],data['MonthlyIncome'],data['TotalWorkingYears']]).T
att_model.fit(X=feat, y=data["Attrition"])



DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [118]:
with open("atree.dot",'w') as  f:
    f=tree.export_graphviz(att_model,feature_names=['Age','MonthlyIncome','TotalWorkingYears'],out_file=f);

## if TotalWorkingYears is < 1.5 and monthly income is between 23000 to 110000 then the chance of attrition is high

## if TotalWorkingYears is > 1.5 , Age is < 33 then rate of attrition is very low

# Bank Loan Modelling

In [31]:
dataset = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx",sheet_name=1)

In [27]:
bankr_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [33]:
dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [34]:
dataset1 = dataset.drop(['ID','ZIP Code'], axis =1)
dataset2 = dataset1.dropna()
dataset3 = dataset2.drop_duplicates()
dataset3.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [36]:
dataset3['CCAvg'] = np.round(dataset3['CCAvg'])
dataset3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,2.0,1,0,0,1,0,0,0
1,45,19,34,3,2.0,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,3.0,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [44]:
bfeat = ['Age', 'Experience', 'Income','Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account',
       'CD Account', 'Online', 'CreditCard']

In [45]:
bankr_model.fit(X=df10[bfeat], y=df10["Personal Loan"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
print(bankr_model.oob_score_)

0.987


In [47]:
for feature,imp in zip(bfeat, bankr_model.feature_importances_):
    print(feature,"\t",imp)

Age 	 0.0450254833604272
Experience 	 0.044344567613691065
Income 	 0.35086307206398726
Family 	 0.09340777068616538
CCAvg 	 0.18096516693709105
Education 	 0.1590143584656015
Mortgage 	 0.04662051603640426
Securities Account 	 0.00549846576742232
CD Account 	 0.05552289333112569
Online 	 0.00863366844902175
CreditCard 	 0.010104037289062354


## Income, CCAvg, Education

In [56]:
bank_model = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10)
predictors = pd.DataFrame([dataset3["Income"],dataset3["CCAvg"],dataset3["Education"]]).T

bank_model.fit(X=bfeat1, y=dataset3["Personal Loan"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=10,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [61]:
with open("Dtree.dot",'w') as f:
    f = tree.export_graphviz(bank_model, feature_names=['Education','CCAvg','Income'], out_file=f);

# When income is less than 100 dollar probability of getting loan is less