In [48]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset as a plain array. The first two columns are used as
# features and the last column as the integer class label.
data = pd.read_csv('t1.csv', delimiter=',').values
x = data[:, :2]
# Fixed: the label is the single last column (-1). The previous slice `:-1`
# selected every column *except* the last one, producing a 2-D matrix instead
# of a label vector.
y = data[:, -1].astype(int)

# Random row mask: each row lands in the training set with probability 0.9,
# so the split is approximately (not exactly) 90/10 and changes on every run
# because no seed is set.
train = np.random.choice([True, False], len(data), replace=True, p=[0.9, 0.1])
x_train, y_train = x[train], y[train]
x_test, y_test = x[~train], y[~train]


In [49]:
# Notebook-level classifier that later cells refit and reuse; only the solver
# is set explicitly, every other hyper-parameter keeps its library default.
logit = linear_model.LogisticRegression(solver = 'liblinear')
# Left as the last expression so the notebook echoes the fitted estimator's repr.
logit.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [50]:
# Accuracy on the held-out rows (printed).
print(logit.score(x_test, y_test))
# Accuracy on the training rows (displayed as the cell result), for comparison
# with the held-out figure above.
pred_y = logit.predict(x_train)
# accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
accuracy_score(y_train, pred_y)

0.5943439962911451


0.590758068890697

In [25]:
# Inspect the learned weights of the most recently fitted `logit`.
# NOTE(review): this cell's execution count (In [25]) is out of sequence with its
# neighbours, so the recorded output may belong to a different fit - re-run to confirm.
logit.coef_

array([[-13.46009488, -11.8617987 ]])

In [4]:
# Same experiment on the smoking subset: columns 0-1 are the features,
# column 2 is the integer label.
data = pd.read_csv('t2-smoking.csv', delimiter=',').values
x = data[:, :2]
y = data[:, 2].astype(int)

# Per-row coin flip (p=0.9 for "train") used as a boolean mask.
train = np.random.choice([True, False], len(data), replace=True, p=[0.9, 0.1])
x_train, y_train = x[train], y[train]
x_test, y_test = x[~train], y[~train]

# Refit the shared classifier and show its held-out accuracy as the cell result.
logit.fit(x_train, y_train).score(x_test, y_test)

0.924790236460717

In [16]:


def try_test(file, model=None, random_state=None):
    """Fit a classifier on one CSV file and return its held-out accuracy.

    The last column of the file is used as the label and every other column
    as a feature. Rows are split 80/20 into train and test sets.

    Parameters
    ----------
    file : str or path-like
        CSV file to load (forwarded to ``pd.read_csv``).
    model : estimator, optional
        Classifier exposing ``fit`` and ``predict``. When omitted, a fresh
        ``LogisticRegression(solver='liblinear')`` is created so the call
        neither depends on nor refits the notebook-level ``logit`` object.
        Pass ``logit`` explicitly to get the old shared-model behaviour.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the original unseeded shuffling.

    Returns
    -------
    float
        Fraction of test rows that were classified correctly.

    Side effects
    ------------
    Prints the number of rows in the test split.
    """
    data = pd.read_csv(file, delimiter=',').values
    x = data[:, :-1]
    y = data[:, -1]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    if model is None:
        model = linear_model.LogisticRegression(solver='liblinear')

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print(len(y_pred))

    # accuracy_score expects (y_true, y_pred); the arguments were previously swapped.
    return accuracy_score(y_test, y_pred)

In [6]:
# Spot check on one file: try_test prints the test-set size and returns the accuracy.
# NOTE(review): this cell's execution count (In [6]) is lower than that of the cell
# defining try_test (In [16]), so the recorded output may come from an earlier
# definition - re-run to confirm.
try_test('t2-smoking.csv')

13093


0.9283586649354617

In [7]:
# Spot check on the t2 "diff" file.
# NOTE(review): executed (In [7]) before the current try_test definition (In [16]);
# the recorded output may be stale.
try_test('t2-diff.csv')

13093


0.8880317727029711

In [8]:
# Spot check on the full t2 file.
# NOTE(review): executed (In [8]) before the current try_test definition (In [16]);
# the recorded output may be stale.
try_test('t2.csv')

13093


0.9266783777591079

In [9]:
# Spot check on t4.csv (this file is not part of the `filenames` batch list).
# NOTE(review): executed (In [9]) before the current try_test definition (In [16]);
# the recorded output may be stale.
try_test('t4.csv')

13093


0.8838310547620866

In [10]:
# Spot check on the full t3 file.
# NOTE(review): executed (In [10]) before the current try_test definition (In [16]);
# the recorded output may be stale.
try_test('t3.csv')

13093


0.8903230733979989

In [11]:
# Every dataset file to evaluate. For each trial prefix the subset variants
# come first (in the order below), followed by the full file for that trial.
_TRIALS = ("t1", "t2", "t3")
_SUBSETS = (
    "diff",
    "healthy",
    "non-exercises",
    "non-smokers",
    "only-females",
    "only-males",
    "smoking",
    "unhealthy",
)
filenames = [
    name
    for trial in _TRIALS
    for name in [trial + "-" + subset + ".csv" for subset in _SUBSETS] + [trial + ".csv"]
    if name != "t1-diff.csv"  # this entry was commented out in the hand-written list
]

In [18]:
# Evaluate every dataset; try_test prints the test-set size first, then this
# loop prints the file name next to the accuracy it returned.
for file in filenames:
    accuracy = try_test(file)
    print(file, accuracy)
    

9290
t1-healthy.csv 0.726264800861141
2562
t1-non-exercises.csv 0.6943793911007026
11947
t1-non-smokers.csv 0.7232778103289529
8515
t1-only-females.csv 0.7133294186729301
4579
t1-only-males.csv 0.7246123607774624
13093
t1-smoking.csv 0.7219124723134499
3804
t1-unhealthy.csv 0.7108307045215563
13093
t1.csv 0.7221416023829528
13093
t2-diff.csv 0.6915145497594134
9290
t2-healthy.csv 0.7314316469321851
2562
t2-non-exercises.csv 0.695160031225605
11947
t2-non-smokers.csv 0.7175860048547753
8515
t2-only-females.csv 0.7196711685261303
4579
t2-only-males.csv 0.7182791002402271
13093
t2-smoking.csv 0.7159550905063774
3804
t2-unhealthy.csv 0.7121451104100947
13093
t2.csv 0.7176353776827312
13093
t3-diff.csv 0.6961735278393034
9290
t3-healthy.csv 0.7271259418729817
2562
t3-non-exercises.csv 0.6459797033567526
11947
t3-non-smokers.csv 0.713317150749142
8515
t3-only-females.csv 0.723429242513212
4579
t3-only-males.csv 0.7300720681371479
13093
t3-smoking.csv 0.7293973879172077
3804
t3-unhealthy.csv 

In [53]:
from sklearn import preprocessing

def try_test2(file, random_state=None):
    """Compare logistic regression on raw, degree-2 and degree-3 polynomial features.

    The last column of the file is used as the label and every other column
    as a feature. Rows are split 80/20 once, and the same split is used for
    all three feature sets so the scores are directly comparable.

    Parameters
    ----------
    file : str or path-like
        CSV file to load (forwarded to ``pd.read_csv``).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. ``None`` (the default) keeps
        the original unseeded shuffling.

    Returns
    -------
    None
        Results are printed: the file name, then one line per degree with the
        mean accuracy on the test split.
    """
    data = pd.read_csv(file, delimiter=',').values
    x = data[:, :-1]
    y = data[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )

    print(file)
    for degree in (1, 2, 3):
        if degree == 1:
            # Degree 1 uses the raw columns (no added bias column), as before.
            feats_train, feats_test = x_train, x_test
        else:
            poly = preprocessing.PolynomialFeatures(degree)
            feats_train = poly.fit_transform(x_train)
            # Fixed: the test split is only transformed with the transformer
            # fitted on the training split; it is never used for fitting.
            feats_test = poly.transform(x_test)

        # Fresh estimator per degree, so nothing depends on (or overwrites)
        # the notebook-level `logit` object.
        model = linear_model.LogisticRegression(solver='liblinear')
        model.fit(feats_train, y_train)
        print("\tPolynomial ^ %d, score is," % degree, model.score(feats_test, y_test))
    

In [54]:
# Run the polynomial-degree comparison on every dataset; try_test2 prints its
# own report and returns nothing.
# NOTE(review): the recorded output contains "<file> None" lines that this loop
# does not print - presumably left over from an earlier version that wrapped the
# call in print(); re-run to refresh.
for file in filenames:
    try_test2(file)
    

t1-healthy.csv
	Polynomial ^ 1, score is, 0.7215285252960172
	Polynomial ^ 2, score is, 0.7180839612486545
	Polynomial ^ 3, score is, 0.7168998923573735
t1-healthy.csv None
t1-non-exercises.csv
	Polynomial ^ 1, score is, 0.7103825136612022
	Polynomial ^ 2, score is, 0.7056986729117877
	Polynomial ^ 3, score is, 0.6990632318501171
t1-non-exercises.csv None
t1-non-smokers.csv
	Polynomial ^ 1, score is, 0.7155771323344773
	Polynomial ^ 2, score is, 0.713317150749142
	Polynomial ^ 3, score is, 0.7092994057085461
t1-non-smokers.csv None
t1-only-females.csv
	Polynomial ^ 1, score is, 0.7240164415736935
	Polynomial ^ 2, score is, 0.7179095713446858
	Polynomial ^ 3, score is, 0.7118027011156782
t1-only-females.csv None
t1-only-males.csv
	Polynomial ^ 1, score is, 0.7158768290019655
	Polynomial ^ 2, score is, 0.7163136055907403
	Polynomial ^ 3, score is, 0.7115090631142171
t1-only-males.csv None
t1-smoking.csv
	Polynomial ^ 1, score is, 0.7221416023829528
	Polynomial ^ 2, score is, 0.7200794317

In [68]:

def try_test3(file):
    """Fit an ordinary least-squares model on one CSV file and print a short report.

    The last column of the file is the regression target and all other columns
    are the predictors. Rows are split 80/20 (unseeded) into train and test
    sets. Three things are printed, in this order: the fitted coefficients,
    the file name, and the value of ``LinearRegression.score`` on the test
    split (tab-prefixed).

    Parameters
    ----------
    file : str or path-like
        CSV file to load (forwarded to ``pd.read_csv``).

    Returns
    -------
    None
    """
    table = pd.read_csv(file, delimiter=',').values
    features, target = table[:, :-1], table[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

    # fit() returns the estimator itself, so construction and fitting can be chained.
    regr = linear_model.LinearRegression().fit(x_train, y_train)

    print(regr.coef_)
    print(file)
    print("\t", regr.score(x_test, y_test))

In [69]:
# Run the linear-regression report on every dataset; try_test3 prints the
# coefficients, the file name and the test-split score itself.
for file in filenames:
    try_test3(file)

[-9.61678992e-02 -1.78151559e-02  1.13983055e-01  1.79613975e-02
  2.24647960e-02 -4.04261935e-02  2.78443210e-05  7.19034677e-03
 -1.03193999e-03  1.97915821e-03  1.00265695e-02  2.47644471e-03]
t1-healthy.csv
	 0.2307944645272878
[-8.07896223e-02  7.94717960e-03  7.28424427e-02  1.83780537e-02
  4.43329226e-03 -2.28113459e-02  3.43706953e-05 -9.40247448e-03
 -3.63114471e-04  2.61079780e-03  8.60198586e-03  2.32103119e-03
 -3.18913855e-03 -5.63386897e-02]
t1-non-exercises.csv
	 0.20103286873883608
[-9.19539170e-02 -1.43230830e-02  1.06277000e-01  1.64252742e-02
  2.04270150e-02 -3.68522892e-02  2.91136306e-05 -2.62975296e-03
 -9.26004153e-04  2.00281443e-03  9.79374923e-03  2.29505357e-03
 -3.45694615e-02 -4.27704174e-02]
t1-non-smokers.csv
	 0.22768986600560825
[-9.27980685e-02 -1.87721476e-02  1.11570216e-01  1.76452407e-02
  1.71648990e-02 -3.48101397e-02  3.01460422e-05 -7.37708881e-04
  1.81986479e-03  9.83667633e-03  2.27145813e-03 -8.67388922e-03
 -2.81447915e-02 -3.72111019e-0