In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import seaborn as sns
import warnings 

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

warnings.filterwarnings('ignore')
sns.set_style('darkgrid')

In [2]:
''' reading dataset  df1, df2, df3 '''
df1 = pd.read_csv('Train/First_Health_Camp_Attended.csv')
df2 = pd.read_csv('Train/Second_Health_Camp_Attended.csv')
df3 = pd.read_csv('Train/Third_Health_Camp_Attended.csv')

In [3]:
''' displaying first 5 rows of df1 '''
df1.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Donation,Health_Score,Unnamed: 4
0,506181,6560,40,0.439024,
1,494977,6560,20,0.097561,
2,518680,6560,10,0.04878,
3,509916,6560,30,0.634146,
4,488006,6560,20,0.02439,


In [4]:
''' displaying first 5 rows of df2 '''
df2.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Health Score
0,526631,6536,0.875136
1,509122,6536,0.7557
2,498864,6536,0.673181
3,515398,6536,0.722041
4,504624,6536,0.464712


In [5]:
''' displaying first 5 rows of df3 '''
df3.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Number_of_stall_visited,Last_Stall_Visited_Number
0,517875,6527,3,1
1,504692,6578,1,1
2,504692,6527,3,1
3,493167,6527,4,4
4,510954,6528,2,2


In [6]:
''' shape of df1 '''
df1.shape

(6218, 5)

In [7]:
''' shape of df2 '''
df2.shape

(7819, 3)

In [8]:
''' shape of df3 '''
df3.shape

(6515, 4)

In [10]:
''' dropping 'Donation', 'Unnamed: 4' from df1 '''
df1.drop(['Donation', 'Unnamed: 4'], axis=1, inplace=True)

In [11]:
''' dropping Last_Stall_Visited_Number from df3 '''
df3.drop(['Last_Stall_Visited_Number'], axis=1, inplace=True)

In [12]:
h1 = []
i = 0
for i in range(0, df1.shape[0]):
    if df1['Health_Score'][i] > df1['Health_Score'].mean():
        h1.append(1)
    else:
        h1.append(0)

In [13]:
h2 = []
j = 0
for j in range(0, df2.shape[0]):
    if df2['Health Score'][j] > df2['Health Score'].mean():
        h2.append(1)
    else:
        h2.append(0)

In [14]:
h3 = []
k = 0
for k in range(0, df3.shape[0]):
    if df3['Number_of_stall_visited'][k] == 0:
        h3.append(0)
    else:
        h3.append(1)

In [15]:
''' adding new column in each df '''
df1['hs_binary'] = h1
df2['hs_binary'] = h2
df3['hs_binary'] = h3

In [16]:
''' dropping some columns from each df '''
df1 = df1.drop('Health_Score', axis=1)
df1_copy = df2.copy()

df2 = df2.drop('Health Score', axis=1)
df2_copy = df2.copy()

df3 = df3.drop('Number_of_stall_visited', axis=1)
df3_copy = df3.copy()

In [17]:
''' concatening all df'''
df = pd.concat([df1_copy, df2_copy, df3_copy], axis=0).reset_index().drop('index', axis=1)

In [18]:
''' reading new data '''
train_df = pd.read_csv('Train/Train.csv')

In [19]:
''' displaying data of train_df '''
train_df.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,489652,6578,10-Sep-05,4,0,0,0,2
1,507246,6578,18-Aug-05,45,5,0,0,7
2,523729,6534,29-Apr-06,0,0,0,0,0
3,524931,6535,07-Feb-04,0,0,0,0,0
4,521364,6529,28-Feb-06,15,1,0,0,7


In [21]:
''' concatenating train_df and  df'''
df_new = pd.merge(df, train_df, on=['Patient_ID', 'Health_Camp_ID'], how='inner')

In [22]:
df_new.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Health Score,hs_binary,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,526631,6536,0.875136,1,,0,0,0,0,0
1,526631,6536,,1,,0,0,0,0,0
2,509122,6536,0.7557,1,13-Feb-05,1,0,0,0,1
3,509122,6536,,1,13-Feb-05,1,0,0,0,1
4,498864,6536,0.673181,1,13-Jan-05,1,0,0,0,1


In [23]:
''' checking null values '''
df_new.isnull().sum()

Patient_ID               0
Health_Camp_ID           0
Health Score         14334
hs_binary                0
Registration_Date      301
Var1                     0
Var2                     0
Var3                     0
Var4                     0
Var5                     0
dtype: int64

In [24]:
''' dropping null values '''
df_ = df_new.dropna(axis=0).reset_index().drop('index', axis=1)

In [25]:
''' checking null values '''
df_.isnull().sum()

Patient_ID           0
Health_Camp_ID       0
Health Score         0
hs_binary            0
Registration_Date    0
Var1                 0
Var2                 0
Var3                 0
Var4                 0
Var5                 0
dtype: int64

In [26]:
''' converting registraion_date column into datetime'''
date = pd.to_datetime(df_['Registration_Date'], format='%d-%b-%y')

In [28]:
m = []
for i in range(0, df_.shape[0]):
    m.append(date[i].month)

In [29]:
''' adding new column in df_ '''
df_['Months'] = m

In [30]:
df_.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Health Score,hs_binary,Registration_Date,Var1,Var2,Var3,Var4,Var5,Months
0,509122,6536,0.7557,1,13-Feb-05,1,0,0,0,1,2
1,498864,6536,0.673181,1,13-Jan-05,1,0,0,0,1,1
2,515398,6536,0.722041,1,10-Feb-05,0,0,0,0,0,2
3,504624,6536,0.464712,0,17-Feb-05,0,0,0,0,0,2
4,486444,6536,0.587405,1,13-Feb-05,0,0,0,0,0,2


In [31]:
''' dropping registraion_date from df_ '''
df_ = df_.drop('Registration_Date', axis=1)

In [33]:
'''taking important columns '''
data = df_[['Patient_ID', 'Health_Camp_ID', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Months', 'hs_binary']]

In [34]:
''' independent and dependent features '''
X = data.iloc[:, 2:-1]
y = data.iloc[:, -1:]

In [35]:
X.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Months
0,1,0,0,0,1,2
1,1,0,0,0,1,1
2,0,0,0,0,0,2
3,0,0,0,0,0,2
4,0,0,0,0,0,2


In [36]:
y.head()

Unnamed: 0,hs_binary
0,1
1,1
2,1
3,0
4,1


In [54]:
class cs(BaseEstimator, TransformerMixin): 
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.sc = StandardScaler(copy,with_mean,with_std)
        self.c = columns
        self.m = None
        self.v = None
    
    def fit(self, X, y=None):
        self.sc.fit(X[self.c], y)
        self.m = np.mean(X[self.c])
        self.v = np.var(X[self.c])
        return self

    def transform(self, X, y=None, copy=None):
        col_order = X.columns
        X_sc = pd.DataFrame(self.sc.transform(X[self.c]), columns=self.c)

        X_not_sc = X.loc[:,~X.columns.isin(self.c)]

        return pd.concat([X_not_sc, X_sc], axis=1)[col_order]

In [55]:
cols = ['Months']
cols_sc = [x for x in X.columns.values if x not in cols]

In [58]:
hc_sc = cs(cols_sc)

''' fit on data '''
hc_sc.fit(X)

sc_inp = hc_sc.transform(X)

In [59]:
''' train test split '''
X_train, X_test, y_train, y_test = train_test_split(sc_inp, y, train_size=0.75, random_state=0)

In [60]:
print("X_train shape: ", X_train.shape)
print("X_test shape : ", X_test.shape)

X_train shape:  (5751, 6)
X_test shape :  (1918, 6)


In [62]:
''' logistic regression '''
lg = LogisticRegression()

''' fit on data '''
lg.fit(X_train, y_train)

LogisticRegression()

In [63]:
print(lg.score(X_train, y_train))

0.5338202051817075


In [72]:
''' reading test data '''
test_df = pd.read_csv('Train/test.csv')

In [73]:
''' displaying first 5 rows '''
test_df.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,505701,6548,21-May-06,1,0,0,0,2
1,500633,6584,02-Jun-06,0,0,0,0,0
2,506945,6582,10-Aug-06,0,0,0,0,0
3,497447,6551,27-Aug-06,0,0,0,0,0
4,496446,6533,19-Sep-06,0,0,0,0,0


In [74]:
''' converting registration date into datetime column '''
date2 = pd.to_datetime(test_df['Registration_Date'], format='%d-%b-%y')

In [75]:
m2 = []
for i in range(0, test_df.shape[0]):
    m2.append(date2[i].month)

In [77]:
test_df.head()

Unnamed: 0,Patient_ID,Health_Camp_ID,Registration_Date,Var1,Var2,Var3,Var4,Var5
0,505701,6548,21-May-06,1,0,0,0,2
1,500633,6584,02-Jun-06,0,0,0,0,0
2,506945,6582,10-Aug-06,0,0,0,0,0
3,497447,6551,27-Aug-06,0,0,0,0,0
4,496446,6533,19-Sep-06,0,0,0,0,0


In [78]:
''' creating new column months '''
test_df['Months'] = m2

''' taking important features '''
test_df = test_df[['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Months']]

In [79]:
''' prediction '''
pred = lg.predict_proba(test_df)

In [81]:
pred = np.argmax(y_test, axis=1)

In [84]:
print("Accuracy Score: ", metrics.accuracy_score(y_test, pred))

Accuracy Score:  0.5375391032325338
