In [288]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [289]:
df = pd.read_csv('surveyA.csv') #load dataset

In [290]:
df.info                         #checking initial dataset info

<bound method DataFrame.info of       age     race gender     employment          education married  \
0      30     Kree      M       Employed        High School     Yes   
1      43  Sapiens      M  Self-employed  Bachelor's Degree     Yes   
2      28   Skrull      F       Employed  Bachelor's Degree     Yes   
3      57     Kree      F  Self-employed        High School     Yes   
4      51   Skrull      F     Unemployed        High School     Yes   
...   ...      ...    ...            ...                ...     ...   
2221   63  Sapiens      M  Self-employed        High School     Yes   
2222   59   Others      M  Self-employed        High School     Yes   
2223   64  Sapiens      M       Employed        High School     Yes   
2224   34     Kree      M       Employed            Diploma      No   
2225   32     Kree      F  Self-employed       Certificates      No   

     person_living_in_house        salary                          house_type  \
0                         3      1

In [291]:
df.head(10)         #preview first 10 rows of dataset

Unnamed: 0,age,race,gender,employment,education,married,person_living_in_house,salary,house_type,house_value,...,transport_use,transport_spending,public_transport_spending,house_utility,food_spending,kids_spending,personal_loan,education_loan,other_loan,investment
0,30,Kree,M,Employed,High School,Yes,3,1K to 2K,Rental house - flat,Below 100K,...,Own transport,250.0,0.0,120.0,300.0,160.0,0.0,0.0,200.0,0.0
1,43,Sapiens,M,Self-employed,Bachelor's Degree,Yes,3,5K to 6K,Own house - condominiums,300K to 400K,...,Own transport,700.0,0.0,350.0,1000.0,500.0,300.0,0.0,0.0,0.0
2,28,Skrull,F,Employed,Bachelor's Degree,Yes,5,2K to 3K,Parent's house,,...,Own transport,600.0,0.0,300.0,600.0,100.0,0.0,0.0,0.0,0.0
3,57,Kree,F,Self-employed,High School,Yes,4,1K to 2K,Own house - kampung / wooden house,Below 100K,...,Own transport,100.0,0.0,200.0,800.0,450.0,0.0,0.0,0.0,0.0
4,51,Skrull,F,Unemployed,High School,Yes,5,3K to 4K,Own house - flat,Below 100K,...,,0.0,0.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0
5,76,Kree,M,Government retiree,High School,Yes,2,1K to 2K,Own house - double storey terrace,,...,Own transport,100.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0
6,63,Kree,F,Unemployed,High School,Divorcee,3,1K to 2K,Own house - double storey terrace,500K to 600K,...,Own transport,550.0,0.0,100.0,1000.0,0.0,0.0,0.0,0.0,0.0
7,34,Kree,F,Employed,Bachelor's Degree,No,8,3K to 4K,Own house - double storey terrace,500K to 600K,...,Own transport,800.0,0.0,300.0,1000.0,0.0,0.0,200.0,0.0,0.0
8,24,Kree,M,Employed,Diploma,No,6,1K to 2K,Parent's house,,...,Own transport,400.0,0.0,200.0,200.0,0.0,0.0,0.0,0.0,0.0
9,59,Sapiens,M,Employed,Diploma,Yes,4,10K or more,Own house - double storey terrace,600K to 700K,...,,0.0,0.0,600.0,4000.0,0.0,0.0,0.0,0.0,0.0


In [292]:
df.shape     #2226 rows and 23 columns

(2226, 23)

In [293]:
df.columns    #name of initial attributes

Index(['age', 'race', 'gender', 'employment', 'education', 'married',
       'person_living_in_house', 'salary', 'house_type', 'house_value',
       'vehicle', 'house_rental_fee', 'house_loan_pmt', 'transport_use',
       'transport_spending', 'public_transport_spending', 'house_utility',
       'food_spending', 'kids_spending', 'personal_loan', 'education_loan',
       'other_loan', 'investment'],
      dtype='object')

In [294]:
display(df.dtypes.value_counts())     #find out the count of each attributes's data type

object     11
float64    11
int64       1
dtype: int64

In [295]:
df.isnull().sum()    #checking missing values of each attributes

age                            0
race                           0
gender                         0
employment                     2
education                      3
married                        3
person_living_in_house         5
salary                        58
house_type                     9
house_value                  957
vehicle                       29
house_rental_fee             153
house_loan_pmt                87
transport_use                525
transport_spending            32
public_transport_spending    231
house_utility                 12
food_spending                 11
kids_spending                 27
personal_loan                132
education_loan               133
other_loan                   114
investment                   152
dtype: int64

In [296]:
df.dtypes         #checking each attributes' data type

age                            int64
race                          object
gender                        object
employment                    object
education                     object
married                       object
person_living_in_house        object
salary                        object
house_type                    object
house_value                   object
vehicle                       object
house_rental_fee             float64
house_loan_pmt               float64
transport_use                 object
transport_spending           float64
public_transport_spending    float64
house_utility                float64
food_spending                float64
kids_spending                float64
personal_loan                float64
education_loan               float64
other_loan                   float64
investment                   float64
dtype: object

In [297]:
#dropping house_type and transport_use column
df = df.drop(['house_type','transport_use'], axis=1)
df.head()

Unnamed: 0,age,race,gender,employment,education,married,person_living_in_house,salary,house_value,vehicle,...,house_loan_pmt,transport_spending,public_transport_spending,house_utility,food_spending,kids_spending,personal_loan,education_loan,other_loan,investment
0,30,Kree,M,Employed,High School,Yes,3,1K to 2K,Below 100K,Local brand car,...,0.0,250.0,0.0,120.0,300.0,160.0,0.0,0.0,200.0,0.0
1,43,Sapiens,M,Self-employed,Bachelor's Degree,Yes,3,5K to 6K,300K to 400K,Asia brand car,...,950.0,700.0,0.0,350.0,1000.0,500.0,300.0,0.0,0.0,0.0
2,28,Skrull,F,Employed,Bachelor's Degree,Yes,5,2K to 3K,,Local brand car,...,0.0,600.0,0.0,300.0,600.0,100.0,0.0,0.0,0.0,0.0
3,57,Kree,F,Self-employed,High School,Yes,4,1K to 2K,Below 100K,Motorcycle,...,0.0,100.0,0.0,200.0,800.0,450.0,0.0,0.0,0.0,0.0
4,51,Skrull,F,Unemployed,High School,Yes,5,3K to 4K,Below 100K,Asia brand car,...,0.0,0.0,0.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0


In [298]:
df.isnull().sum()    #checking missing values of each attributes

age                            0
race                           0
gender                         0
employment                     2
education                      3
married                        3
person_living_in_house         5
salary                        58
house_value                  957
vehicle                       29
house_rental_fee             153
house_loan_pmt                87
transport_spending            32
public_transport_spending    231
house_utility                 12
food_spending                 11
kids_spending                 27
personal_loan                132
education_loan               133
other_loan                   114
investment                   152
dtype: int64

In [299]:
#replacing missing values with mode
df['salary'] = df['salary'].fillna(df['salary'].mode()[0])

In [300]:
df.isnull().sum()

age                            0
race                           0
gender                         0
employment                     2
education                      3
married                        3
person_living_in_house         5
salary                         0
house_value                  957
vehicle                       29
house_rental_fee             153
house_loan_pmt                87
transport_spending            32
public_transport_spending    231
house_utility                 12
food_spending                 11
kids_spending                 27
personal_loan                132
education_loan               133
other_loan                   114
investment                   152
dtype: int64

In [301]:
df['salary']

0           1K to 2K
1           5K to 6K
2           2K to 3K
3           1K to 2K
4           3K to 4K
            ...     
2221        2K to 3K
2222    Less than 1K
2223        2K to 3K
2224        7K to 8K
2225        1K to 2K
Name: salary, Length: 2226, dtype: object

In [302]:
#replacing missing values with mode
df['house_value'] = df['house_value'].fillna(df['house_value'].mode()[0])

In [303]:
df.isnull().sum()

age                            0
race                           0
gender                         0
employment                     2
education                      3
married                        3
person_living_in_house         5
salary                         0
house_value                    0
vehicle                       29
house_rental_fee             153
house_loan_pmt                87
transport_spending            32
public_transport_spending    231
house_utility                 12
food_spending                 11
kids_spending                 27
personal_loan                132
education_loan               133
other_loan                   114
investment                   152
dtype: int64

In [304]:
#replacing missing values with mode
df['employment'] = df['employment'].fillna(df['employment'].mode()[0])
df['education'] = df['education'].fillna(df['education'].mode()[0])
df['married'] = df['married'].fillna(df['married'].mode()[0])
df['person_living_in_house'] = df['person_living_in_house'].fillna(df['person_living_in_house'].mode()[0])
df['vehicle'] = df['vehicle'].fillna(df['vehicle'].mode()[0])
df['house_rental_fee'] = df['house_rental_fee'].fillna(df['house_rental_fee'].mode()[0])
df['house_loan_pmt'] = df['house_loan_pmt'].fillna(df['house_loan_pmt'].mode()[0])
df['transport_spending'] = df['transport_spending'].fillna(df['transport_spending'].mode()[0])
df['public_transport_spending'] = df['public_transport_spending'].fillna(df['public_transport_spending'].mode()[0])
df['house_utility'] = df['house_utility'].fillna(df['house_utility'].mode()[0])
df['food_spending'] = df['food_spending'].fillna(df['food_spending'].mode()[0])
df['kids_spending'] = df['kids_spending'].fillna(df['kids_spending'].mode()[0])
df['personal_loan'] = df['personal_loan'].fillna(df['personal_loan'].mode()[0])
df['education_loan'] = df['education_loan'].fillna(df['education_loan'].mode()[0])
df['other_loan'] = df['other_loan'].fillna(df['other_loan'].mode()[0])
df['investment'] = df['investment'].fillna(df['investment'].mode()[0])

In [305]:
df.isnull().sum()           #no missing values left

age                          0
race                         0
gender                       0
employment                   0
education                    0
married                      0
person_living_in_house       0
salary                       0
house_value                  0
vehicle                      0
house_rental_fee             0
house_loan_pmt               0
transport_spending           0
public_transport_spending    0
house_utility                0
food_spending                0
kids_spending                0
personal_loan                0
education_loan               0
other_loan                   0
investment                   0
dtype: int64

In [306]:
df.head()

Unnamed: 0,age,race,gender,employment,education,married,person_living_in_house,salary,house_value,vehicle,...,house_loan_pmt,transport_spending,public_transport_spending,house_utility,food_spending,kids_spending,personal_loan,education_loan,other_loan,investment
0,30,Kree,M,Employed,High School,Yes,3,1K to 2K,Below 100K,Local brand car,...,0.0,250.0,0.0,120.0,300.0,160.0,0.0,0.0,200.0,0.0
1,43,Sapiens,M,Self-employed,Bachelor's Degree,Yes,3,5K to 6K,300K to 400K,Asia brand car,...,950.0,700.0,0.0,350.0,1000.0,500.0,300.0,0.0,0.0,0.0
2,28,Skrull,F,Employed,Bachelor's Degree,Yes,5,2K to 3K,Below 100K,Local brand car,...,0.0,600.0,0.0,300.0,600.0,100.0,0.0,0.0,0.0,0.0
3,57,Kree,F,Self-employed,High School,Yes,4,1K to 2K,Below 100K,Motorcycle,...,0.0,100.0,0.0,200.0,800.0,450.0,0.0,0.0,0.0,0.0
4,51,Skrull,F,Unemployed,High School,Yes,5,3K to 4K,Below 100K,Asia brand car,...,0.0,0.0,0.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0


In [307]:
#reset value of each columns into integer for easier analysis
rename_value = {"race": {"Kree":1, "Sapiens":2, "Skrull":3, "Others":4},
               "gender": {"F":1, "M":2},
               "employment": {"Employed":1, "Government retiree":2, "Self-employed":3, "Private sector retiree":4, "Unemployed":5, "Others":6},
               "education": {"Bachelor's Degree":1,"Diploma":2, "Certificates":3, "High School":4},
               "married": {"No":0, "Yes":1, "Divorcee":2},
               "person_living_in_house": {"10 or more" : 10},
               "salary" : {"1K to 2K":1, "2K to 3K":1, "3K to 4K":1, "4K to 5K":1,"Less than 1K":1, "5K to 6K":2, "7K to 8K":2, "8K to 9K":2, "9K to 10K":2,"10K or more":2},
               "house_value": {"Below 100K":1, "100K to 200K":2, "200K to 300K":3, "300K to 400K":4,"400K to 500K":5, "500K to 600K":6,"600K to 700K":7,"700K to 800K":8,"800K to 1M":9, "More than 1M":10},
               "vehicle": {"Asia brand car":1, "Did not own any vehicle":2, "Europe brand car":3, "Local brand car":4, "Motorcycle":5}}

In [308]:
df = df.replace(rename_value)
df

Unnamed: 0,age,race,gender,employment,education,married,person_living_in_house,salary,house_value,vehicle,...,house_loan_pmt,transport_spending,public_transport_spending,house_utility,food_spending,kids_spending,personal_loan,education_loan,other_loan,investment
0,30,1,2,1,4,1,3,1,1,4,...,0.0,250.0,0.0,120.0,300.0,160.0,0.0,0.0,200.0,0.0
1,43,2,2,3,1,1,3,2,4,1,...,950.0,700.0,0.0,350.0,1000.0,500.0,300.0,0.0,0.0,0.0
2,28,3,1,1,1,1,5,1,1,4,...,0.0,600.0,0.0,300.0,600.0,100.0,0.0,0.0,0.0,0.0
3,57,1,1,3,4,1,4,1,1,5,...,0.0,100.0,0.0,200.0,800.0,450.0,0.0,0.0,0.0,0.0
4,51,3,1,5,4,1,5,1,1,1,...,0.0,0.0,0.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221,63,2,2,3,4,1,5,1,6,5,...,0.0,40.0,0.0,100.0,200.0,0.0,0.0,0.0,200.0,0.0
2222,59,4,2,3,4,1,4,1,1,1,...,0.0,200.0,0.0,90.0,150.0,0.0,0.0,0.0,0.0,0.0
2223,64,2,2,1,4,1,5,1,4,1,...,700.0,900.0,9.0,100.0,1000.0,53.0,0.0,0.0,0.0,0.0
2224,34,1,2,1,2,0,4,2,1,3,...,0.0,1200.0,0.0,400.0,2000.0,0.0,500.0,0.0,0.0,300.0


In [309]:
df = df.astype({'person_living_in_house':'int'})
print(df.dtypes)

age                            int64
race                           int64
gender                         int64
employment                     int64
education                      int64
married                        int64
person_living_in_house         int32
salary                         int64
house_value                    int64
vehicle                        int64
house_rental_fee             float64
house_loan_pmt               float64
transport_spending           float64
public_transport_spending    float64
house_utility                float64
food_spending                float64
kids_spending                float64
personal_loan                float64
education_loan               float64
other_loan                   float64
investment                   float64
dtype: object


In [310]:
df['person_living_in_house'] = pd.to_numeric(df['person_living_in_house'], errors="coerce").fillna(0).astype('int64')
#converting int32 to int64

In [311]:
df.dtypes

age                            int64
race                           int64
gender                         int64
employment                     int64
education                      int64
married                        int64
person_living_in_house         int64
salary                         int64
house_value                    int64
vehicle                        int64
house_rental_fee             float64
house_loan_pmt               float64
transport_spending           float64
public_transport_spending    float64
house_utility                float64
food_spending                float64
kids_spending                float64
personal_loan                float64
education_loan               float64
other_loan                   float64
investment                   float64
dtype: object

In [312]:
df['age'] = df['age'].astype(float)

In [313]:
df.dtypes

age                          float64
race                           int64
gender                         int64
employment                     int64
education                      int64
married                        int64
person_living_in_house         int64
salary                         int64
house_value                    int64
vehicle                        int64
house_rental_fee             float64
house_loan_pmt               float64
transport_spending           float64
public_transport_spending    float64
house_utility                float64
food_spending                float64
kids_spending                float64
personal_loan                float64
education_loan               float64
other_loan                   float64
investment                   float64
dtype: object

In [314]:
df['race'] = df['race'].astype(float)
df['gender'] = df['gender'].astype(float)
df['employment'] = df['employment'].astype(float)
df['education'] = df['education'].astype(float)
df['married'] = df['married'].astype(float)
df['person_living_in_house'] = df['person_living_in_house'].astype(float)
df['salary'] = df['salary'].astype(float)
df['house_value'] = df['house_value'].astype(float)
df['vehicle'] = df['vehicle'].astype(float)

In [315]:
df.dtypes

age                          float64
race                         float64
gender                       float64
employment                   float64
education                    float64
married                      float64
person_living_in_house       float64
salary                       float64
house_value                  float64
vehicle                      float64
house_rental_fee             float64
house_loan_pmt               float64
transport_spending           float64
public_transport_spending    float64
house_utility                float64
food_spending                float64
kids_spending                float64
personal_loan                float64
education_loan               float64
other_loan                   float64
investment                   float64
dtype: object

In [316]:
#rearranging columns for easier x and y assignment
column_names= ["age","race","gender","employment","education","married","person_living_in_house","house_value","vehicle","house_rental_fee","house_loan_pmt","transport_spending","public_transport_spending",
              "house_utility","food_spending","kids_spending","personal_loan","education_loan","other_loan","investment","salary"]

df1=df.reindex(columns=column_names)

In [317]:
df1

Unnamed: 0,age,race,gender,employment,education,married,person_living_in_house,house_value,vehicle,house_rental_fee,...,transport_spending,public_transport_spending,house_utility,food_spending,kids_spending,personal_loan,education_loan,other_loan,investment,salary
0,30.0,1.0,2.0,1.0,4.0,1.0,3.0,1.0,4.0,450.0,...,250.0,0.0,120.0,300.0,160.0,0.0,0.0,200.0,0.0,1.0
1,43.0,2.0,2.0,3.0,1.0,1.0,3.0,4.0,1.0,0.0,...,700.0,0.0,350.0,1000.0,500.0,300.0,0.0,0.0,0.0,2.0
2,28.0,3.0,1.0,1.0,1.0,1.0,5.0,1.0,4.0,0.0,...,600.0,0.0,300.0,600.0,100.0,0.0,0.0,0.0,0.0,1.0
3,57.0,1.0,1.0,3.0,4.0,1.0,4.0,1.0,5.0,0.0,...,100.0,0.0,200.0,800.0,450.0,0.0,0.0,0.0,0.0,1.0
4,51.0,3.0,1.0,5.0,4.0,1.0,5.0,1.0,1.0,0.0,...,0.0,0.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221,63.0,2.0,2.0,3.0,4.0,1.0,5.0,6.0,5.0,0.0,...,40.0,0.0,100.0,200.0,0.0,0.0,0.0,200.0,0.0,1.0
2222,59.0,4.0,2.0,3.0,4.0,1.0,4.0,1.0,1.0,0.0,...,200.0,0.0,90.0,150.0,0.0,0.0,0.0,0.0,0.0,1.0
2223,64.0,2.0,2.0,1.0,4.0,1.0,5.0,4.0,1.0,0.0,...,900.0,9.0,100.0,1000.0,53.0,0.0,0.0,0.0,0.0,1.0
2224,34.0,1.0,2.0,1.0,2.0,0.0,4.0,1.0,3.0,0.0,...,1200.0,0.0,400.0,2000.0,0.0,500.0,0.0,0.0,300.0,2.0


In [318]:
#assigning x and y values/preparing data for modelling
# independent variables
X = df1.iloc[:, :-1].values
#dependent variable
y = df1.iloc[:, -1:].values

In [319]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import metrics 
from sklearn.metrics import mean_squared_error,accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [320]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [322]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.90


In [323]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[563  11]
 [ 53  41]]


In [324]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.91      0.98      0.95       574
         2.0       0.79      0.44      0.56        94

    accuracy                           0.90       668
   macro avg       0.85      0.71      0.75       668
weighted avg       0.90      0.90      0.89       668



In [321]:
#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [325]:
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [326]:
# Predict the salary for the test data
y_pred = linreg.predict(X_test)

In [327]:
mse = mean_squared_error(y_test, y_pred)

print('Mean squared error:', mse)


In [328]:
print("Accuracy on Test Data : {:,.2f}".format(linreg.score(X_test , y_test) *100 ) , '%')

In [347]:
def predictsalary(model , file):
#     import numpy as np
#     import pandas as pd
#     import matplotlib.pyplot as plt
#     import seaborn as sns
#     from sklearn.model_selection import train_test_split

#     import warnings
#     warnings.simplefilter(action="ignore", category=FutureWarning)
    
#     from sklearn.linear_model import LinearRegression, LogisticRegression
#     from sklearn import metrics 
#     from sklearn.metrics import mean_squared_error,accuracy_score, confusion_matrix, classification_report
#     from sklearn.model_selection import train_test_split
    
    dftest = pd.read_csv(file)
    dftest = dftest.drop(['house_type','transport_use'], axis=1)
    dftest['salary'] = dftest['salary'].fillna(dftest['salary'].mode()[0])
    dftest['house_value'] = dftest['house_value'].fillna(dftest['house_value'].mode()[0])
    dftest['employment'] = dftest['employment'].fillna(dftest['employment'].mode()[0])
    dftest['education'] = dftest['education'].fillna(dftest['education'].mode()[0])
    dftest['married'] = dftest['married'].fillna(dftest['married'].mode()[0])
    dftest['person_living_in_house'] = dftest['person_living_in_house'].fillna(dftest['person_living_in_house'].mode()[0])
    dftest['vehicle'] = dftest['vehicle'].fillna(dftest['vehicle'].mode()[0])
    dftest['house_rental_fee'] = dftest['house_rental_fee'].fillna(dftest['house_rental_fee'].mode()[0])
    dftest['house_loan_pmt'] = dftest['house_loan_pmt'].fillna(dftest['house_loan_pmt'].mode()[0])
    dftest['transport_spending'] = dftest['transport_spending'].fillna(dftest['transport_spending'].mode()[0])
    dftest['public_transport_spending'] = dftest['public_transport_spending'].fillna(dftest['public_transport_spending'].mode()[0])
    dftest['house_utility'] = dftest['house_utility'].fillna(dftest['house_utility'].mode()[0])
    dftest['food_spending'] = dftest['food_spending'].fillna(dftest['food_spending'].mode()[0])
    dftest['kids_spending'] = dftest['kids_spending'].fillna(dftest['kids_spending'].mode()[0])
    dftest['personal_loan'] = dftest['personal_loan'].fillna(dftest['personal_loan'].mode()[0])
    dftest['education_loan'] = dftest['education_loan'].fillna(dftest['education_loan'].mode()[0])
    dftest['other_loan'] = dftest['other_loan'].fillna(dftest['other_loan'].mode()[0])
    dftest['investment'] = dftest['investment'].fillna(dftest['investment'].mode()[0])
    rename_value = {"race": {"Kree":1, "Sapiens":2, "Skrull":3, "Others":4},
               "gender": {"F":1, "M":2},
               "employment": {"Employed":1, "Government retiree":2, "Self-employed":3, "Private sector retiree":4, "Unemployed":5, "Others":6},
               "education": {"Bachelor's Degree":1,"Diploma":2, "Certificates":3, "High School":4},
               "married": {"No":0, "Yes":1, "Divorcee":2},
               "person_living_in_house": {"10 or more" : 10},
               "salary" : {"1K to 2K":1, "2K to 3K":1, "3K to 4K":1, "4K to 5K":1,"Less than 1K":1, "5K to 6K":2, "7K to 8K":2, "8K to 9K":2, "9K to 10K":2,"10K or more":2},
               "house_value": {"Below 100K":1, "100K to 200K":2, "200K to 300K":3, "300K to 400K":4,"400K to 500K":5, "500K to 600K":6,"600K to 700K":7,"700K to 800K":8,"800K to 1M":9, "More than 1M":10},
               "vehicle": {"Asia brand car":1, "Did not own any vehicle":2, "Europe brand car":3, "Local brand car":4, "Motorcycle":5}}
    dftest = dftest.replace(rename_value)
    dftest = dftest.astype({'person_living_in_house':'int'})
    dftest['person_living_in_house'] = pd.to_numeric(dftest['person_living_in_house'], errors="coerce").fillna(0).astype('int64')
    dftest['age'] = dftest['age'].astype(float)
    dftest['race'] = dftest['race'].astype(float)
    dftest['gender'] = dftest['gender'].astype(float)
    dftest['employment'] = dftest['employment'].astype(float)
    dftest['education'] = dftest['education'].astype(float)
    dftest['married'] = dftest['married'].astype(float)
    dftest['person_living_in_house'] = dftest['person_living_in_house'].astype(float)
    dftest['salary'] = dftest['salary'].astype(float)
    dftest['house_value'] = dftest['house_value'].astype(float)
    dftest['vehicle'] = dftest['vehicle'].astype(float)
    column_names= ["age","race","gender","employment","education","married","person_living_in_house","house_value","vehicle","house_rental_fee","house_loan_pmt","transport_spending","public_transport_spending",
              "house_utility","food_spending","kids_spending","personal_loan","education_loan","other_loan","investment","salary"]
    dftest1=dftest.reindex(columns=column_names)
    
    #assigning x and y values/preparing data for modelling
    # independent variables
    X = df1.iloc[:, :-1].values
    #dependent variable
    y = df1.iloc[:, -1:].values

    X2 = dftest1.iloc[:, :-1].values
    y2 = dftest1.iloc[:, -1:].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    X_train2,X_test2,y_train2,y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=0)
    if(model == 'logisticregression'):
        logreg = LogisticRegression()
        logreg.fit(X_train, y_train)
        y_predict = logreg.predict(X_test2)
        output = print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test2, y_test2)*100),'%')
        
    elif (model == 'linearregression'):
        #split data into training and testing sets
      
        linreg = LinearRegression()
        linreg.fit(X_train, y_train)
        
        
        y_pred = linreg.predict(X_test2)
        output = print("Accuracy on Test Data : {:,.2f}".format(linreg.score(X_test2, y_test2) *100 ) , '%')
        
    return output

## function predictsalary('model','file') 
    - model can be choose from string given: 'logisticregression' or 'linearregression'
    - file need to be uploaded in CSV format and in Jupyter workspace for the function to be working

In [349]:
predictsalary('logisticregression', 'copytest.csv')

Accuracy of logistic regression classifier on test set: 90.12 %


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [350]:
rename_value = {"salary" : {1:'B',2:'M'}}
df = df.replace(rename_value)
df.head(10)

Unnamed: 0,age,race,gender,employment,education,married,person_living_in_house,salary,house_value,vehicle,...,house_loan_pmt,transport_spending,public_transport_spending,house_utility,food_spending,kids_spending,personal_loan,education_loan,other_loan,investment
0,30.0,1.0,2.0,1.0,4.0,1.0,3.0,B,1.0,4.0,...,0.0,250.0,0.0,120.0,300.0,160.0,0.0,0.0,200.0,0.0
1,43.0,2.0,2.0,3.0,1.0,1.0,3.0,M,4.0,1.0,...,950.0,700.0,0.0,350.0,1000.0,500.0,300.0,0.0,0.0,0.0
2,28.0,3.0,1.0,1.0,1.0,1.0,5.0,B,1.0,4.0,...,0.0,600.0,0.0,300.0,600.0,100.0,0.0,0.0,0.0,0.0
3,57.0,1.0,1.0,3.0,4.0,1.0,4.0,B,1.0,5.0,...,0.0,100.0,0.0,200.0,800.0,450.0,0.0,0.0,0.0,0.0
4,51.0,3.0,1.0,5.0,4.0,1.0,5.0,B,1.0,1.0,...,0.0,0.0,0.0,180.0,0.0,0.0,0.0,0.0,0.0,0.0
5,76.0,1.0,2.0,2.0,4.0,1.0,2.0,B,1.0,4.0,...,600.0,100.0,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0
6,63.0,1.0,1.0,5.0,4.0,2.0,3.0,B,6.0,4.0,...,0.0,550.0,0.0,100.0,1000.0,0.0,0.0,0.0,0.0,0.0
7,34.0,1.0,1.0,1.0,1.0,0.0,8.0,B,6.0,4.0,...,2000.0,800.0,0.0,300.0,1000.0,0.0,0.0,200.0,0.0,0.0
8,24.0,1.0,2.0,1.0,2.0,0.0,6.0,B,1.0,4.0,...,0.0,400.0,0.0,200.0,200.0,0.0,0.0,0.0,0.0,0.0
9,59.0,2.0,2.0,1.0,2.0,1.0,4.0,M,7.0,1.0,...,3000.0,0.0,0.0,600.0,4000.0,0.0,0.0,0.0,0.0,0.0
