# Task 1

In [None]:
from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Classifier algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [None]:
X, y = make_classification(n_samples = 2000, n_features = 20, n_informative = 12, 
                           n_redundant = 5, random_state = 7)
print(X.shape, y.shape)

## LogisticRegression with static parameters

In [None]:
steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', LogisticRegression()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

## LogisticRegression with dynamic number of splits

In [None]:
X, y = make_classification(n_samples = 2000, n_features = 20, n_informative = 12, 
                           n_redundant = 5, random_state = 7)
print(X.shape, y.shape)

steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', LogisticRegression()))
pipeline = Pipeline(steps = steps)
best_score_splits = 0
best_number_of_splits = 5
for splits in range(5, 16):
    rskf = RepeatedStratifiedKFold(n_splits = splits, n_repeats = 3, random_state = 1)
    accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
    if (mean(accur)*100) > best_score_splits:
        best_number_of_splits = splits
        best_score_splits = round(mean(accur)*100, 3)
        
    print('Accuracy is equal to: %.3f' % (mean(accur) * 100) + ' - number of splits: ' + str(splits))
print(f"With {best_number_of_splits} splits we've got the best accuracy, which is {best_score_splits}")

## LogisticRegression with dynamic number of repeats

In [None]:
best_score_repeats = 0
best_number_of_repeats = 3
for repeat in range(3, 10):
    rskf = RepeatedStratifiedKFold(n_splits = 11, n_repeats = repeat, random_state = 1)
    accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
    if (mean(accur)*100) > best_score_repeats:
        best_number_of_repeats = repeat
        best_score_repeats = round(mean(accur)*100, 3)
    print('Accuracy is equal to: %.3f' % (mean(accur) * 100) + ' - number of repeats: ' + str(repeat))

print(f"With {best_number_of_repeats} splits we've got the best accuracy, which is {best_score_repeats}")

## Using RandomForestClassifier

In [6]:
steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', RandomForestClassifier()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

Accuracy is equal to: 94.500


## Using LinearDiscriminantAnalysis

In [7]:
steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', LinearDiscriminantAnalysis()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

Accuracy is equal to: 90.450


## Using KNeighborsClassifier

In [8]:

steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', KNeighborsClassifier()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

Accuracy is equal to: 95.683


## Using GaussianNB

In [9]:
steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', GaussianNB()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

Accuracy is equal to: 89.367


## Using DecisionTreeClassifier

In [10]:
steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', DecisionTreeClassifier()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

Accuracy is equal to: 85.967


## Using SVC

In [11]:
steps = list()
steps.append(('scaler', MinMaxScaler()))
steps.append(('model', SVC()))
pipeline = Pipeline(steps = steps)

rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
print('Accuracy is equal to: %.3f' % (mean(accur) * 100))

Accuracy is equal to: 96.800


# Looping models

In [27]:
models = [
    LogisticRegression(),
    RandomForestClassifier(),
    LinearDiscriminantAnalysis(),
    KNeighborsClassifier(),
    GaussianNB(),
    DecisionTreeClassifier(),
    SVC()
]

In [28]:
best_model_name = ''
best_score = 0

for current_model in models:
    steps = list()
    steps.append(('scaler', MinMaxScaler()))
    steps.append(('model', current_model))
    pipeline = Pipeline(steps = steps)
    rskf = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)
    accur = cross_val_score(pipeline, X, y, scoring='accuracy', cv = rskf, n_jobs = -1)
    print('Accuracy of', current_model.__class__.__name__, 'model is equal to: %.3f' % (mean(accur) * 100))
    if mean(accur*100) > best_score:
        best_model_name = current_model.__class__.__name__
        best_score = round(mean(accur)*100, 3)
print(f"\nThe best model is {best_model_name} with score equal to: {best_score}")

Accuracy of LogisticRegression model is equal to: 91.317
Accuracy of RandomForestClassifier model is equal to: 94.383
Accuracy of LinearDiscriminantAnalysis model is equal to: 90.450
Accuracy of KNeighborsClassifier model is equal to: 95.683
Accuracy of GaussianNB model is equal to: 89.367
Accuracy of DecisionTreeClassifier model is equal to: 86.183
Accuracy of SVC model is equal to: 96.800

The best model is SVC with score equal to: 96.8


# Task 2: 

In [20]:
import pandas as pd

df = pd.read_csv('oil-spill.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,...,2850.0,1000.0,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
1,2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,...,5750.0,11500.0,9593.48,1648.8,0.6,0,51572.04,65.73,6.26,0
2,3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,...,1400.0,250.0,150.0,45.13,9.33,1,31692.84,65.81,7.84,1
3,4,1201,1562.53,295.65,66,3002500.0,42.4,7.97,18030.0,0.19,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
4,5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0


In [21]:
df.columns

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
           dtype='int64')

In [22]:
df.set_index(0)

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,40,41,42,43,44,45,46,47,48,49
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,214.7,...,2850.00,1000.00,763.16,135.46,3.73,0,33243.19,65.74,7.95,1
2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,901.7,...,5750.00,11500.00,9593.48,1648.80,0.60,0,51572.04,65.73,6.26,0
3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,86.1,...,1400.00,250.00,150.00,45.13,9.33,1,31692.84,65.81,7.84,1
4,1201,1562.53,295.65,66,3002500.0,42.40,7.97,18030.0,0.19,166.5,...,6041.52,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1
5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,232.8,...,1320.04,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,12,92.42,364.42,135,97200.0,59.42,10.34,884.0,0.17,110.0,...,381.84,254.56,84.85,146.97,4.50,0,2593.50,65.85,6.39,0
201,11,98.82,248.64,159,89100.0,59.64,10.18,831.0,0.17,107.2,...,284.60,180.00,150.00,51.96,1.90,0,4361.25,65.70,6.53,0
202,14,25.14,428.86,24,113400.0,60.14,17.94,847.0,0.30,133.9,...,402.49,180.00,180.00,0.00,2.24,0,2153.05,65.91,6.12,0
203,10,96.00,451.30,68,81000.0,59.90,15.01,831.0,0.25,97.5,...,402.49,180.00,90.00,73.48,4.47,0,2421.43,65.97,6.32,0


In [23]:
df[50] = 50
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,1,2558,1506.09,456.63,90,6395000.0,40.88,7.89,29780.0,0.19,...,1000.0,763.16,135.46,3.73,0,33243.19,65.74,7.95,1,50
1,2,22325,79.11,841.03,180,55812500.0,51.11,1.21,61900.0,0.02,...,11500.0,9593.48,1648.8,0.6,0,51572.04,65.73,6.26,0,50
2,3,115,1449.85,608.43,88,287500.0,40.42,7.34,3340.0,0.18,...,250.0,150.0,45.13,9.33,1,31692.84,65.81,7.84,1,50
3,4,1201,1562.53,295.65,66,3002500.0,42.4,7.97,18030.0,0.19,...,761.58,453.21,144.97,13.33,1,37696.21,65.67,8.07,1,50
4,5,312,950.27,440.86,37,780000.0,41.43,7.03,3350.0,0.17,...,710.63,512.54,109.16,2.58,0,29038.17,65.66,7.35,0,50


In [24]:
df_unique = df.nunique()
df_unique
print(f"Number of columns before deletion: {df.shape[1]}")
cols_to_delete = [i for i, v in enumerate(df_unique) if v == 1]
print(f"Columns {cols_to_delete} need to be deleted")
df = df.drop(cols_to_delete, axis=1)
print(f"Number of columns after deletion: {df.shape[1]}")

Number of columns before deletion: 51
Columns [22, 50] need to be deleted
Number of columns after deletion: 49
