In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import numpy as np
from sklearn.metrics import recall_score, precision_score
import imblearn
from xgboost import XGBClassifier

%matplotlib inline

In [2]:
df_train = pd.read_csv("train_dataset_train.csv")

In [29]:
df_train.head()

Unnamed: 0,id,Easting,Northing,Height,Reflectance,Class
0,2321251,431696.5375,6032319.0,69.2226,-11.14,0
1,3515173,431710.3835,6032291.0,68.9711,-15.16,3
2,2320295,431696.8099,6032322.0,69.2453,-13.59,0
3,2454459,431680.4542,6032343.0,69.1892,-11.21,0
4,4608150,431720.0914,6032288.0,67.3252,-9.1,0


In [3]:
df_train.loc[(df_train.Class == 64), 'Class'] = 2

In [31]:
df_train.Class.value_counts()

0    2731040
3    1287816
4     103453
5      55985
1      39384
2       4416
Name: Class, dtype: int64

In [48]:
X = df_train.drop(["Class", "id"], axis = 1)
y = df_train["Class"]

In [49]:
from imblearn.under_sampling import NearMiss
nm = NearMiss()
X_train_miss, Y_train_miss = nm.fit_resample(X, y.ravel())

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X_train_miss, Y_train_miss, random_state = 1)

In [51]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((19872, 4), (6624, 4), (19872,), (6624,))

In [36]:
from pandas import read_csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
import numpy
# split data into X and y
X = X_train
y = y_train
# grid search
model = XGBClassifier()
n_estimators = [1000]
learning_rate = [0.01, 0.05, 0.1, 0.2, 0.3]
param_grid = dict(learning_rate=learning_rate, n_estimators=n_estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
grid_search = GridSearchCV(model, param_grid, scoring="neg_log_loss", n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))
# plot results
scores = numpy.array(means).reshape(len(learning_rate), len(n_estimators))
for i, value in enumerate(learning_rate):
    pyplot.plot(n_estimators, scores[i], label='learning_rate: ' + str(value))
pyplot.legend()
pyplot.xlabel('n_estimators')
pyplot.ylabel('Log Loss')
pyplot.savefig('n_estimators_vs_learning_rate.png')

Best: -0.196090 using {'learning_rate': 0.05, 'n_estimators': 700}
-0.511518 (0.011194) with: {'learning_rate': 0.01, 'n_estimators': 200}
-0.263438 (0.011355) with: {'learning_rate': 0.01, 'n_estimators': 600}
-0.249373 (0.010914) with: {'learning_rate': 0.01, 'n_estimators': 700}
-0.240654 (0.010520) with: {'learning_rate': 0.01, 'n_estimators': 800}
-0.228677 (0.009908) with: {'learning_rate': 0.01, 'n_estimators': 1000}
-0.228273 (0.010060) with: {'learning_rate': 0.05, 'n_estimators': 200}
-0.196622 (0.009405) with: {'learning_rate': 0.05, 'n_estimators': 600}
-0.196090 (0.009455) with: {'learning_rate': 0.05, 'n_estimators': 700}
-0.196377 (0.009373) with: {'learning_rate': 0.05, 'n_estimators': 800}
-0.198224 (0.009788) with: {'learning_rate': 0.05, 'n_estimators': 1000}
-0.202103 (0.009391) with: {'learning_rate': 0.1, 'n_estimators': 200}
-0.202040 (0.010946) with: {'learning_rate': 0.1, 'n_estimators': 600}
-0.204925 (0.011176) with: {'learning_rate': 0.1, 'n_estimators': 700

In [4]:
X = df_train.drop(["Class", "id"], axis = 1)
y = df_train["Class"]
X.shape

(4222094, 4)

In [5]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_res, y_res = sm.fit_resample(X, y)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.65, shuffle = True, random_state = 1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5735184, 4), (10651056, 4), (5735184,), (10651056,))

In [7]:
my_model = XGBClassifier(n_estimators=1000, learning_rate=0.1)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=10, 
             eval_set=[(X_test, y_test)], 
             verbose=False)
y_pred = my_model.predict(X_test)



In [8]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, mean_absolute_error
my_model_acc = accuracy_score(y_pred, y_test) * 100
print("доля верных прогнозов алгоритма:", my_model_acc)

доля верных прогнозов алгоритма: 98.63708349669741


In [9]:
test_data = pd.read_csv('test_dataset_test.csv')
features = ['Easting','Northing','Height','Reflectance']
test_X = test_data[features]
y_pred_all_data = my_model.predict(test_X)


In [10]:
output = pd.DataFrame({'id': test_data.id,
                       'Class': y_pred_all_data})
output['Class'] = np.where((output.Class == 2), 64, output.Class)
output['Class'].value_counts()

0     1149637
3      557241
4       51517
5       23940
1       16739
64      10395
Name: Class, dtype: int64

In [11]:
output.to_csv('submission97.csv', index=False)