In [1]:
import sys
from pathlib import Path

module_path = str(Path.cwd().parents[0])
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd

import sklearn
import os


from sklearn.metrics import accuracy_score

import config.settings
import autosklearn
from sklearn.model_selection import train_test_split
from autosklearn import classification

In [3]:
def read_csv(path:os.path):
    """

    :return:
    """
    df = pd.read_csv((path))
    # train, test =sklearn.model_selection.train_test_split(df,test_size=0.2)

    return df


def scoring_function(estimator, X, Y):
    predictions = estimator.predict(X)
    return sklearn.metrics.accuracy_score(Y, predictions)



In [5]:
root = config.settings.get_project_path()
dataset_path = os.path.join(root,'Data')

df= read_csv(os.path.join(dataset_path,'train.csv'))
df_test = read_csv(os.path.join(dataset_path,'test.csv'))
# y_train = train_df['class']
# y_test = test_df['class']
# X_train = train_df.drop('class',axis=1)
# X_test = test_df.drop('class', axis =1)

X = df.loc[:, df.columns != 'class']

Y = df.loc[:, df.columns == 'class']
test_size = 0.2
shuffle = True

train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=test_size, shuffle=shuffle)

estimator = classification.AutoSklearnClassifier(
    include = {
        'classifier': ["adaboost",],
    },
    time_left_for_this_task=300,  # in seconds
    seed=42,
    resampling_strategy='cv',
    resampling_strategy_arguments={'shuffle': True, 'folds': 5},
    metric=autosklearn.metrics.accuracy,
    n_jobs=-1
)
estimator.fit(train_X, train_Y, dataset_name='hackathon_data')
print(f"Train Auto-Sklearn Classifier performance is {scoring_function(estimator, train_X, train_Y)}")
print(f"Test Auto-Sklearn Classifier performance is {scoring_function(estimator, test_X, test_Y)}")



Train Auto-Sklearn Classifier performance is 0.12599744853290643
Test Auto-Sklearn Classifier performance is 0.12156078039019509


In [6]:
test_X

Unnamed: 0,V1,V2,V3,V4,V6,V7,V8,V9,V10,V11,...,V91,V92,V93,V94,V95,V96,V97,V98,V99,V100
17342,241.872996,-888.957561,867.842355,416.869288,877.662166,-1461.917119,279.825118,-1466.941598,879.180060,-5382.657332,...,841.476492,151.183358,900.716629,233.969053,-734.449754,9876.563070,238.044837,484.445684,2,2
30130,27954.127479,379.081441,-3269.841069,421.018775,876.004410,2427.881461,-3514.747534,399.743590,882.791820,-8615.971074,...,1028.041158,150.365827,-15026.164497,308.515463,-673.069833,18066.874870,82.366289,657.091906,4,3
48914,-2342.805025,-693.246313,-754.393472,421.090820,877.290199,811.786895,-427.453512,-393.854173,880.545537,2511.924715,...,991.883495,148.528297,249.446717,410.587829,-753.049378,1879.786500,262.113596,479.028457,1,1
42624,2877.990897,-35.257740,-46.942829,421.476057,877.946386,-195.037406,-1629.286382,-1363.241102,886.200593,9315.114200,...,898.761246,151.211693,230.169859,45.115685,-786.009945,-1386.501050,382.816613,168.542471,1,1
22315,23265.936499,-4.495199,1161.019419,420.694330,870.969858,-20321.519668,533.961915,-326.241727,890.722327,781.365228,...,953.533715,151.543183,5359.594265,337.196783,-786.316958,12784.855133,465.342305,480.306088,5,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20461,1153.129933,-233.208596,-400.431638,420.841469,874.217640,-222.158978,-455.248318,-510.967480,880.055355,-3326.164664,...,871.903340,138.296390,113.176814,300.024479,-753.976645,-325.454749,280.033151,507.611732,1,1
28010,665.340335,-282.274023,-237.058757,420.304149,874.907235,337.702531,22.629385,-679.718912,875.716326,-1018.585701,...,920.399626,145.809520,-186.817760,399.594194,-732.834295,706.436236,226.431210,445.756667,1,1
37211,-13025.079515,-296.432723,296.811495,420.936313,880.550040,703.620095,671.185885,-732.972939,861.720289,-136.664066,...,952.089585,145.663784,4906.147912,280.026056,-850.145054,-12584.112075,138.051075,519.340547,1,1
10504,1637.319919,-2689.923913,-1746.776072,425.990780,888.619086,310.446890,531.139474,-973.642734,873.924272,285.438287,...,1770.792405,106.105427,66.009131,362.574445,-791.146325,1467.642948,287.886840,544.735138,3,1


In [7]:
print(estimator.show_models())



[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'gaussian_nb', 'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'fast_ica', 'data_preprocessor:feature_type:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessor:feature_type:categorical_transformer:category_coalescence:__choice__': 'no_coalescense', 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'median', 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'none', 'feature_preprocessor:fast_ica:algorithm': 'deflation', 'feature_preprocessor:fast_ica:fun': 'cube', 'feature_preprocessor:fast_ica:whiten': 'False'},
dataset_properties={
  'task': 2,
  'sparse': False,
  'multilabel': False,
  'multiclass': True,
  'target_type': 'classification',
  'signed': False})),
]


In [8]:
estimator.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,1,1.0,gaussian_nb,0.879906,82.655758


In [21]:
df_test = read_csv(os.path.join(dataset_path,'test.csv'))

In [22]:
test = df_test.loc[:, df_test.columns != 'id']
id_ = df_test.loc[:,df_test.columns == 'id']

In [23]:
test_result = estimator.predict(test)

In [24]:
len(test_result)

66238

In [25]:
test_result

array(['BS', 'AN', 'CY', ..., 'BM', 'AN', 'BV'], dtype=object)

In [17]:
id_

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5
...,...
66201,66234
66202,66235
66203,66236
66204,66237


In [26]:
id_['Predicted'] = list(test_result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_['Predicted'] = list(test_result)


In [27]:
id_.to_csv('submission.csv',index=False)