In [1]:
%reload_ext autoreload
%autoreload 2
import ast
import pickle
import itertools
from collections import Counter
from tqdm import tqdm
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import cv2
import swifter

from analysis.generate_cluster_information_file import load, extract_all_information_query, to_df_query
from baseline.image_processing import pixel_intensity_histogram


from sklearn.model_selection import cross_val_score
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, make_scorer
import xgboost
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV

In [2]:
train_X = pd.read_csv('train_X.csv')
train_y = pd.read_csv('train_y.csv')

pd.to_numeric(train_X['cluster_num'])
pd.to_numeric(train_X['cluster_num_intensities_avg'])
pd.to_numeric(train_X['cluster_peak_intensities_avg'])
pd.to_numeric(train_X['cluster_x_avg'])
pd.to_numeric(train_X['cluster_y_avg'])

scaler = StandardScaler()
train_X_scaled = scaler.fit_transform(train_X)
train_y_scaled = scaler.fit_transform(train_y)

In [3]:
train_X_na = train_X.fillna(0)
train_X_na_scaled = scaler.fit_transform(train_X_na)

### XGBoost

In [37]:
model = xgboost.XGBRegressor(objective="reg:squarederror", learning_rate =0.01, booster='gbtree', n_estimators=1000, max_depth=6, gamma=0.5, subsample=0.7, colsample_bytree=1.0, nthread=-1, verbosity=1)
score_mi = cross_val_score(model, train_X_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

Mean: 0.5432868851516257, Std Dev: 0.009487533968032862


### Linear Regression

In [5]:
clf = LinearRegression()
score_mi = cross_val_score(clf, train_X_na_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

Mean: 0.6320241650837227, Std Dev: 0.011360807620951735


### SVM

**RBF Kernel**

In [113]:
svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)

score_mi = cross_val_score(svr_rbf, train_X_na_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

Mean: 0.559523543350321, Std Dev: 0.013994445700721255


**Linear Kernel**

In [4]:
svr_lin = SVR(kernel='linear', C=100, gamma='auto', verbose=True)

score_mi = cross_val_score(svr_lin, train_X_na_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]Mean: 0.6020756297432326, Std Dev: 0.0145266920724527


**Polynomial Kernal**

In [5]:
svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)

score_mi = cross_val_score(svr_poly, train_X_na_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

Mean: 0.5757662231202911, Std Dev: 0.011833120124295313


### Isolation Forest

In [6]:
isoforest = IsolationForest(random_state=0)

score_mi = cross_val_score(isoforest, train_X_na_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

Mean: 0.9495764713124867, Std Dev: 0.015721076633129598


### Random Forest

In [7]:
randomforest = RandomForestRegressor(max_depth=6, random_state=0)

score_mi = cross_val_score(randomforest, train_X_na_scaled, np.ravel(train_y_scaled), cv=5, scoring=make_scorer(mean_absolute_error))
print("Mean: {}, Std Dev: {}".format(score_mi.mean(), np.std(score_mi)))

Mean: 0.5470079355789131, Std Dev: 0.007756689915654611


In [12]:
def get_features(test_df):
    cluster_sizes_avg = np.empty(len(test_df.index))
    cluster_peak_intensities_avg = np.empty(len(test_df.index))
    cluster_num_intensities_avg = np.empty(len(test_df.index))

    cluster_x_avg = np.empty(len(test_df.index))
    cluster_y_avg = np.empty(len(test_df.index))

    for i in tqdm(range(len(test_df.index))):
        c_s = test_df.iloc[i,2]
        c_p_i = test_df.iloc[i,3]
        c_n_i = test_df.iloc[i,4]
        cluster_sizes_avg[i] = np.average(np.array(c_s))
        cluster_peak_intensities_avg[i] = np.average(np.array(c_p_i))
        cluster_num_intensities_avg[i] = np.average(np.array(c_n_i))
        cluster_num = test_df.iloc[i,1]
        cluster_centers = test_df.iloc[i,5]
        temp_x = np.empty(cluster_num)
        temp_y = np.empty(cluster_num)
        for j in range(cluster_num):
            (x,y) = cluster_centers[j]
            temp_x[j] = x 
            temp_y[j] = y
        cluster_x_avg[i] = np.average(temp_x)
        cluster_y_avg[i] = np.average(temp_y)

    # [cluster_num, cluster_num_intensities_avg, cluster_peak_intensities_avg, cluster_x_avg, cluster_y_avg]
    eval_X = np.empty((len(test_df.index),5))

    for i in tqdm(range(len(test_df.index))):
        eval_X[i,0] = test_df.iloc[i,1]
        eval_X[i, 1] = cluster_num_intensities_avg[i]
        eval_X[i, 2] = cluster_peak_intensities_avg[i]
        eval_X[i, 3] = cluster_x_avg[i]
        eval_X[i, 4] = cluster_y_avg[i]
        
    return eval_X

In [22]:
test_df = pd.read_csv('test_df.csv')
# pd saves lists as strings, so we need to convert them to lists manually
test_df.cluster_sizes = test_df.cluster_sizes.swifter.apply(ast.literal_eval)
test_df.cluster_centers = test_df.cluster_centers.swifter.apply(ast.literal_eval)
test_df.cluster_peak_intensities = test_df.cluster_peak_intensities.swifter.apply(ast.literal_eval)
test_df.cluster_num_intensities = test_df.cluster_num_intensities.swifter.apply(ast.literal_eval)

assert type(test_df.cluster_sizes.tolist()[0]) == list

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1200.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1200.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1200.0, style=ProgressStyle(descriptio…




HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1200.0, style=ProgressStyle(descriptio…




In [38]:
test_df.head()

Unnamed: 0,background_threshold,cluster_num,cluster_sizes,cluster_peak_intensities,cluster_num_intensities,cluster_centers
0,0,94,"[6, 20, 109, 1, 8, 2, 4, 3, 57, 8, 32, 4, 5, 2...","[1, 3, 111, 1, 1, 1, 1, 1, 22, 1, 4, 1, 1, 1, ...","[1, 3, 36, 1, 1, 1, 1, 1, 17, 1, 4, 1, 1, 1, 2...","[(1, 39), (5, 295), (30, 107), (25, 286), (46,..."
1,187,20943,"[69, 1, 70, 4, 381, 111, 73, 70, 1, 16, 41, 26...","[254, 191, 254, 242, 254, 251, 251, 250, 219, ...","[33, 1, 36, 4, 62, 41, 37, 34, 1, 14, 22, 18, ...","[(1, 10), (0, 18), (2, 25), (1, 41), (17, 56),..."
2,0,64,"[4, 82, 11, 7, 2, 40, 102, 160, 3, 6, 7, 54, 3...","[1, 4, 2, 5, 1, 5, 85, 251, 1, 2, 1, 29, 1, 1,...","[1, 4, 2, 4, 1, 5, 34, 54, 1, 2, 1, 17, 1, 1, ...","[(0, 906), (19, 900), (61, 728), (130, 428), (..."
3,3,50,"[197, 9, 19, 59, 5, 10, 45, 18, 35, 84, 19, 20...","[26, 4, 7, 7, 4, 4, 19, 4, 13, 9, 6, 46, 49, 4...","[19, 1, 4, 4, 1, 1, 12, 1, 9, 6, 3, 30, 23, 1,...","[(17, 159), (22, 965), (50, 794), (101, 387), ..."
4,2,57,"[12, 16, 8, 120, 11, 14, 8, 25, 8, 5, 5, 7, 85...","[4, 3, 3, 5, 3, 3, 3, 6, 3, 3, 3, 3, 69, 255, ...","[2, 1, 1, 3, 1, 1, 1, 4, 1, 1, 1, 1, 28, 89, 1...","[(7, 567), (10, 67), (18, 448), (37, 14), (36,..."


In [25]:
del test_df['Unnamed: 0']
eval_X = get_features(test_df)

100%|█████████████████████████████████████████████████████████████████████████████| 1200/1200 [00:01<00:00, 672.78it/s]
100%|███████████████████████████████████████████████████████████████████████████| 1200/1200 [00:00<00:00, 78046.87it/s]


In [27]:
scaler = StandardScaler()
eval_X_scaled = scaler.fit_transform(eval_X)

In [39]:
model.fit(train_X_scaled, train_y_scaled)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0.5, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.01, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=-1, nthread=-1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7, tree_method=None,
             validate_parameters=False, verbosity=1)

In [40]:
predictions = model.predict(eval_X_scaled)
predictions = scaler.inverse_transform(predictions)

In [41]:
predictions_file_ids = [x.replace('.png', '') for x in os.listdir(os.path.join('data','query'))]
results = {'Id': predictions_file_ids, 'Predicted': predictions.reshape(-1)}
results = pd.DataFrame(data=results)
results

Unnamed: 0,Id,Predicted
0,1000956,0.673292
1,1007209,-0.027038
2,1016681,0.602007
3,1043763,0.441347
4,1051472,0.723479
...,...,...
1195,9962129,0.736932
1196,9968166,-0.006876
1197,9972585,0.595115
1198,9981103,0.454601


In [78]:
#Preparing output for the kaggle submission
query_ex_path = os.path.join('data', 'query_example.csv')
query_ex = pd.read_csv(query_ex_path)
query_ex = query_ex.iloc[:,[0]]

query_ex = query_ex.astype(str)
results['Id'].astype(str)

query_preds = pd.merge(query_ex, results, on=['Id'])
pd.to_numeric(query_preds['Predicted'])
query_preds.loc[query_preds['Predicted'] < 0.0, 'Predicted'] = 0.0
query_preds.loc[query_preds['Predicted'] > 8.0, 'Predicted'] = 8.0
query_preds.to_csv('out.csv', index=False)
query_preds

Unnamed: 0,Id,Predicted
0,7452475,0.585724
1,7071865,0.746074
2,9302616,0.462073
3,7562317,0.431731
4,5940084,0.614026
...,...,...
1195,9022030,0.459659
1196,2640528,1.056854
1197,1114602,0.916864
1198,6934039,0.683403
