In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Read the train and test CSV files from the Kaggle input directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/pneumonia/train.csv",
        "/kaggle/input/pneumonia/test.csv",
    )
)
# Preview the first rows; train_data.info() would show dtypes and null counts instead.
train_data.head()

In [None]:
# Bar chart of how many training rows carry each value of the 'label' column.
fig, ax = plt.subplots()
sns.countplot(x=train_data['label'], ax=ax)
plt.show()

In [None]:
# Missing-value handling. Alternatives that were tried and left disabled:
#   np.sum(train_data.isna())                           -> count missing values per column
#   train_data = train_data.fillna(0)                   -> fill with zeros
#   train_data = train_data.fillna(method='pad')        -> fill with the previous row's value
#   train_data = train_data.fillna(train_data.mean())   -> fill with the overall column mean
# Strategy in use: replace each missing value with the mean of that column computed
# over the rows that share the same 'label'.
per_class_means = train_data.groupby('label').transform('mean')
train_data = train_data.fillna(per_class_means)

In [None]:
# Registry of the pipeline chosen for each model family, keyed by a short name.
models = {}

SEED = 42  # fixed seed so the dev sample below is identical on every run
num_training = len(train_data)
num_dev = 10

# Positional slicing: column 0 is skipped and the last column is used as the target.
# NOTE(review): presumably column 0 is the row id and the last column is 'label' — confirm.
X_train = train_data.iloc[:, 1:-1].to_numpy()
y_train = train_data.iloc[:, -1].to_numpy()

# Small dev sample drawn without replacement from the training rows. It overlaps
# X_train (the rows are not removed from it), so it is only suitable for quick
# sanity checks, not as a held-out performance estimate.
mask = np.random.default_rng(SEED).choice(num_training, num_dev, replace=False)
X_dev = X_train[mask]
y_dev = y_train[mask]

# Only column 0 is dropped here (assumes the test frame has no target column).
X_test = test_data.iloc[:, 1:].to_numpy()

print(X_train.shape, y_train.shape, X_dev.shape, y_dev.shape)
print(X_test.shape)

## Linear Model

In [None]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression, BayesianRidge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=42) 
# Use cross-validation instead

In [None]:
# Logistic regression on standardized features, scored with 10-fold cross-validation.
# max_iter must be a finite integer: np.inf is a float and is rejected by
# scikit-learn's parameter validation, so a generous cap is used instead to let
# lbfgs run to convergence.
LOGREG_MAX_ITER = 10_000
# The scaler is passed unfitted: cross_val_score clones the pipeline and fits the
# clone on each training fold, so fitting the scaler beforehand has no effect.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', max_iter=LOGREG_MAX_ITER),
)
score = cross_val_score(pipe, X_train, y_train, cv=10).mean()
print(f'10-fold cross-validation score is {score}')
# Stored unfitted; it has to be fitted on the training data before predicting.
models['linear'] = pipe

## Nearest Neighbors

In [None]:
#from matplotlib.colors import ListedColormap
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Try k = 1..19 for k-nearest-neighbours and keep the pipeline with the best
# mean 10-fold cross-validation score.
best_score, best_model = -np.inf, None

for n_neighbors in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors)
    pipe = make_pipeline(StandardScaler(), knn)
    fold_scores = cross_val_score(pipe, X_train, y_train, cv=10)
    score = fold_scores.mean()
    # Strict comparison: on a tie the earlier (smaller) k is kept.
    if score > best_score:
        best_model = pipe
        best_score = score

print(f'10-fold cross-validation score is {best_score}')
models['knn'] = best_model

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Tune the number of AdaBoost estimators by mean 10-fold cross-validation score.
best_score, best_model = -np.inf, None
n_estimators = [5, 10, 15, 20, 30, 50, 70, 100, 120, 150, 200]

for num in n_estimators:
    ada = AdaBoostClassifier(n_estimators=num, learning_rate=1)
    pipe = make_pipeline(StandardScaler(), ada)
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=10)
    score = cv_scores.mean()
    # Only a strictly better score replaces the current best candidate.
    if score > best_score:
        best_score = score
        best_model = pipe

print(f'10-fold cross-validation score is {best_score}')
models['ada'] = best_model

## Tree Structure

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Tune the number of trees in a random forest by mean 10-fold cross-validation score.
n_estimators = [5, 10, 15, 20, 30, 50, 70, 100, 120, 150, 200]
# Random forests are stochastic; without a fixed seed the CV scores, and therefore
# the selected forest size, can change from run to run.
RF_SEED = 42
best_model = None
best_score = -np.inf

for num in n_estimators:
    rf = RandomForestClassifier(
        n_estimators=num,
        max_depth=None,      # trees are grown without a depth limit
        max_features=None,   # every split considers all features
        random_state=RF_SEED,
    )
    pipe = make_pipeline(StandardScaler(), rf)
    score = cross_val_score(pipe, X_train, y_train, cv=10).mean()
    # Strict comparison: on a tie the smaller forest found first is kept.
    if best_score < score:
        best_score, best_model = score, pipe

print(f'10-fold cross-validation score is {best_score}')
# Stored unfitted; it has to be fitted on the training data before predicting.
models['rf'] = best_model

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Tune the number of LightGBM boosting rounds by mean 10-fold cross-validation score.
best_score, best_model = -np.inf, None
n_estimators = [5, 10, 15, 20, 30, 50, 70, 100, 120, 150, 200]

for num in n_estimators:
    lgbm = LGBMClassifier(n_estimators=num, num_leaves=50, max_depth=-1)
    pipe = make_pipeline(StandardScaler(), lgbm)
    cv_scores = cross_val_score(pipe, X_train, y_train, cv=10)
    score = cv_scores.mean()
    # Keep the first candidate that reaches the highest score seen so far.
    if score > best_score:
        best_model = pipe
        best_score = score

print(f'10-fold cross-validation score is {best_score}')
models['lgbm'] = best_model

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
# Grid-search SVC over the regularization parameter C and the kernel type,
# scoring every combination with 10-fold cross-validation.
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
reg_strengths = [3.5, 3, 2.5, 2, 1.5, 1, 0.5, 0.1, 0.05]
best_score, best_model = -np.inf, None

for reg in reg_strengths:
    for kernel in kernels:
        svc = SVC(kernel=kernel, C=reg)
        pipe = make_pipeline(StandardScaler(), svc)
        cv_scores = cross_val_score(pipe, X_train, y_train, cv=10)
        score = cv_scores.mean()
        # Only a strictly better score replaces the current best candidate.
        if score > best_score:
            best_score = score
            best_model = pipe

print(f'10-fold cross-validation score is {best_score}')
models['svm'] = best_model

## Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Grid-search the MLP architecture and L2 penalty (alpha) with 10-fold cross-validation.
# Each n_units entry describes the hidden layers: an int is a single hidden layer of
# that width, a list gives one width per hidden layer.
n_units = [5, 10, 20, 30, 50, [10, 5], [20, 5], [5, 5]]
reg_strengths = [3.5, 3, 2.5, 2, 1.5]
# max_iter must be a finite integer: np.inf is a float and is rejected by
# scikit-learn's parameter validation, so a generous cap is used instead.
MLP_MAX_ITER = 10_000
# Weight initialisation is random; a fixed seed keeps the comparison reproducible.
MLP_SEED = 42
best_model = None
best_score = -np.inf

for num in n_units:
    # Normalise to a tuple; a bare `(num)` is just `num`, not a one-element tuple.
    hidden_layers = tuple(num) if isinstance(num, (list, tuple)) else (num,)
    for reg in reg_strengths:
        mlp = MLPClassifier(
            solver='lbfgs',
            max_iter=MLP_MAX_ITER,
            alpha=reg,
            hidden_layer_sizes=hidden_layers,
            random_state=MLP_SEED,
        )
        pipe = make_pipeline(StandardScaler(), mlp)
        # No explicit pipe.fit() here: cross_val_score fits its own clones, so a
        # full fit before scoring would only add training time.
        score = cross_val_score(pipe, X_train, y_train, cv=10).mean()
        # Strict comparison: on a tie the configuration found first is kept.
        if best_score < score:
            best_score, best_model = score, pipe

print(f'10-fold cross-validation score is {best_score}')
# Stored unfitted; it has to be fitted on the training data before predicting.
models['mlp'] = best_model

In [None]:
# Display the registry of selected pipelines, one entry per model family.
models

In [None]:
# Fit each selected pipeline on the full training set and predict the test set.
# Models are processed one after another in this order, and the predictions are
# bound to ans1..ans7 in the same order.
model_order = ('linear', 'knn', 'ada', 'rf', 'lgbm', 'svm', 'mlp')
ans1, ans2, ans3, ans4, ans5, ans6, ans7 = [
    models[name].fit(X_train, y_train).predict(X_test)
    for name in model_order
]

In [None]:
# Side-by-side table of every model's test predictions, one row per test id.
prediction_columns = ('linear', 'knn', 'ada', 'rf', 'lgbm', 'svm', 'mlp')
data = {'id': test_data['id'].values}
data.update(zip(prediction_columns, (ans1, ans2, ans3, ans4, ans5, ans6, ans7)))
df = pd.DataFrame(data)

In [None]:
# Display the DataFrame built in the previous cell.
df

In [None]:
# Submission table: test ids paired with the predictions held in ans5
# (the output of models['lgbm']).
data = dict(id=test_data['id'].values, label=ans5)
df = pd.DataFrame(data)

In [None]:
# Write the DataFrame to submission.csv (path relative to the current working
# directory), leaving out the index column.
df.to_csv('submission.csv', index=False)