In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn import datasets, linear_model
from sklearn.metrics import confusion_matrix, f1_score

In [2]:
# Load the training CSV; the relative path is resolved against the kernel's working directory.
df_train = pd.read_csv('Real_Train_chr_data.csv')

In [3]:
# Feature frame: every column of the training data except the target `donr`.
X = df_train.drop(columns='donr')
# Target frame keeps `ID` next to `donr`; later cells select the `donr` column from it.
y = df_train.loc[:, ['ID', 'donr']]

In [4]:
# Remove the `Unnamed: 0` column from the features.
# NOTE(review): presumably a row index saved by an earlier to_csv call - confirm.
X = X.drop(columns='Unnamed: 0')

In [5]:
# One-hot encode `chld`: the column is replaced by `chld_<value>` indicator columns.
X = pd.get_dummies(X, columns=["chld"])

In [6]:
# One-hot encode `wrat`: the column is replaced by `wrat_<value>` indicator columns.
X = pd.get_dummies(X, columns=["wrat"])

In [7]:
# Numeric columns that are min-max scaled to [0, 1] and then expanded into
# polynomial terms (bias, x, x^2, ..., x^5), one column at a time.
numeric_cols = ['avhv', 'incm', 'inca', 'plow', 'npro', 'tgif',
                'lgif', 'rgif', 'tdon', 'tlag', 'agif']

scaler = MinMaxScaler(copy=True)
f_feat = PolynomialFeatures(degree=5)

train_poly = {}
for col in numeric_cols:
    # Read the raw values from df_train, which no cell modifies, rather than
    # from X: X loses these columns below, so reading from X would raise a
    # KeyError when this cell is re-run. The values are the same in both frames.
    raw = df_train[col].to_numpy().reshape(-1, 1)
    # The scaler is re-fitted for every column, so after the loop it only
    # remembers the last column's range.
    train_poly[col] = f_feat.fit_transform(scaler.fit_transform(raw))

# Remove the raw numeric columns from X; their expanded versions are kept as
# separate arrays. Only columns still present are dropped, so re-running is safe.
X = X.drop(columns=[col for col in numeric_cols if col in X.columns])

# Expose each expanded block under its column name for the stacking cell.
(avhv, incm, inca, plow, npro, tgif,
 lgif, rgif, tdon, tlag, agif) = (train_poly[col] for col in numeric_cols)

In [8]:
# Preview the columns left in X after the numeric columns were removed.
X.head()

Unnamed: 0,ID,reg1,reg2,reg3,reg4,home,hinc,genf,chld_0,chld_1,...,wrat_0,wrat_1,wrat_2,wrat_3,wrat_4,wrat_5,wrat_6,wrat_7,wrat_8,wrat_9
0,1836,0,0,0,0,1,4,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,6494,0,0,0,1,1,4,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,113,0,1,0,0,1,2,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2487,0,0,0,0,1,4,1,0,1,...,0,0,0,0,0,0,1,0,0,0
4,2153,0,1,0,0,1,7,1,1,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
# Assemble the design matrix: polynomial blocks of nine numeric columns, followed by
# the columns remaining in X.
# NOTE(review): avhv and plow are expanded in an earlier cell but not stacked here - confirm this is intentional.
# NOTE(review): the X.head() output still lists ID, so the identifier is used as a feature - confirm.
X_train = np.hstack([incm, inca, npro, tgif, lgif, rgif, tdon, tlag, agif, X.to_numpy()])

In [10]:
# Sanity check: (rows, features) of the assembled matrix.
X_train.shape

(4201, 78)

In [11]:
# Hold out 30% of the rows for validation; random_state pins the shuffle so the
# split is repeatable. y carries both ID and donr, so the target parts do too.
X_train_part, X_test_part, Y_train_part, Y_test_part = train_test_split(
    X_train, y, test_size=0.30, random_state=50)

In [12]:
# Logistic regression with the library's default settings.
regr = linear_model.LogisticRegression()

In [13]:
# Fit on the training part; only the donr column of the target frame is used as labels.
regr.fit(X_train_part, Y_train_part['donr'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Predicted class labels for the held-out part.
y_pred = regr.predict(X_test_part)

In [15]:
# Load the second CSV; the final predictions further down are made on this data.
df_test = pd.read_csv('Real_Test_chr_data.csv')

In [16]:
# Preview the first rows of the loaded test frame.
df_test.head()

Unnamed: 0.1,Unnamed: 0,ID,reg1,reg2,reg3,reg4,home,chld,hinc,genf,...,incm,inca,plow,npro,tgif,lgif,rgif,tdon,tlag,agif
0,1782,3503,1,0,0,0,1,3,6,1,...,25,41,22,62,86,11,14,17,10,6.41
1,3917,7875,0,1,0,0,1,0,4,1,...,47,47,14,12,41,23,13,15,3,14.55
2,221,484,0,0,0,0,1,2,2,1,...,76,76,2,67,116,67,45,21,4,21.99
3,2135,4240,0,0,0,0,1,3,4,1,...,29,42,20,41,50,27,20,15,15,14.15
4,1841,7349,0,0,1,0,1,5,2,1,...,72,72,2,78,114,17,24,23,6,7.66


In [17]:
# Confusion matrix on the held-out part: rows are true classes, columns are predicted classes.
confusion_matrix(Y_test_part['donr'], y_pred)

array([[551,  97],
       [ 90, 523]], dtype=int64)

In [18]:
# F1 score on the held-out part (default settings: binary average, positive label 1).
f1_score(Y_test_part['donr'], y_pred)

0.8483373884833738

In [20]:
# Numeric columns that are min-max scaled and then expanded into polynomial
# terms (bias, x, x^2, ..., x^5), one column at a time.
numeric_cols = ['avhv', 'incm', 'inca', 'plow', 'npro', 'tgif',
                'lgif', 'rgif', 'tdon', 'tlag', 'agif']

# Build X_test from a copy so df_test is left untouched and this cell can be re-run.
X_test = df_test.drop(columns='Unnamed: 0')
X_test = pd.get_dummies(X_test, columns=["chld"])
X_test = pd.get_dummies(X_test, columns=["wrat"])

f_feat = PolynomialFeatures(degree=5)

test_poly = {}
for col in numeric_cols:
    # Learn the min/max from the TRAINING column and only apply it to the test
    # column. Re-fitting on the test data would map test values onto a different
    # scale than the one the model was trained on. Test values outside the
    # training range simply land outside [0, 1].
    scaler = MinMaxScaler(copy=True)
    scaler.fit(df_train[col].to_numpy().reshape(-1, 1))
    scaled = scaler.transform(X_test[col].to_numpy().reshape(-1, 1))
    # The polynomial expansion learns nothing from the data values, so fitting
    # it here is harmless.
    test_poly[col] = f_feat.fit_transform(scaled)

# Remove the raw numeric columns, then align the remaining columns with the
# training frame X (same set, same order). A dummy level that is absent from
# the test data becomes an all-zero column; columns unknown to X are dropped.
X_test = X_test.drop(columns=numeric_cols).reindex(columns=X.columns, fill_value=0)

# Expose each expanded block under its column name for the stacking cell.
(avhv, incm, inca, plow, npro, tgif,
 lgif, rgif, tdon, tlag, agif) = (test_poly[col] for col in numeric_cols)

In [21]:
# Assemble the test matrix in the same block order as the training matrix:
# nine polynomial blocks, then the columns remaining in X_test.
# NOTE(review): avhv and plow are left out here as well, matching the training matrix.
X_test_ = np.hstack([incm, inca, npro, tgif, lgif, rgif, tdon, tlag, agif, X_test.to_numpy()])

In [22]:
# Predicted class labels for the test matrix, using the model fitted above.
y_fin = regr.predict(X_test_)

In [23]:
# Display the predicted labels.
y_fin

array([0., 1., 0., ..., 1., 0., 1.])

In [31]:
# Wrap the predictions in a single-column frame and write them to CSV.
y_fin_ = pd.DataFrame(y_fin, columns=['donr'])
filename = '201217_Efremova_1.csv'
# `index` is documented as a boolean; False states the intent (no row-index
# column in the file) explicitly instead of relying on None being falsy.
# NOTE(review): the file contains only donr, without ID - confirm that the
# consumer matches rows by position.
y_fin_.to_csv(filename, index=False)