In [1]:
import pandas as pd
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
processed_train = pd.read_csv('data/processed_data.csv')
processed_train.dropna(inplace=True)

In [3]:
test_data = pd.read_csv('data/test_processed.csv')

In [4]:
count_vectorizer = CountVectorizer()
X_train = count_vectorizer.fit_transform(processed_train['tweet'])

In [5]:
X_train.shape

(7272, 8332)

In [6]:
X_test = count_vectorizer.transform(test_data['tweet'])

In [7]:
X_test.shape

(1819, 8332)

In [14]:
from sklearn.metrics import f1_score, make_scorer

f1 = make_scorer(f1_score , average='macro')

In [18]:
xgb = XGBClassifier()

optimization_dict = {'max_depth': [2,3,4,5,6,7],
                     'n_estimators': [50,60,70,80,90,100,150,200]}

model = GridSearchCV(xgb, optimization_dict, 
                     scoring=f1, verbose=5)
model.fit(X_train, processed_train['sentiment'])

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ max_depth=2, n_estimators=50, score=0.311, total=   3.2s
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s


[CV] ........ max_depth=2, n_estimators=50, score=0.322, total=   2.9s
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV] ........ max_depth=2, n_estimators=50, score=0.299, total=   2.8s
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    8.8s remaining:    0.0s


[CV] ........ max_depth=2, n_estimators=50, score=0.292, total=   3.0s
[CV] max_depth=2, n_estimators=50 ....................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.8s remaining:    0.0s


[CV] ........ max_depth=2, n_estimators=50, score=0.308, total=   2.9s
[CV] max_depth=2, n_estimators=60 ....................................
[CV] ........ max_depth=2, n_estimators=60, score=0.317, total=   3.4s
[CV] max_depth=2, n_estimators=60 ....................................
[CV] ........ max_depth=2, n_estimators=60, score=0.325, total=   3.4s
[CV] max_depth=2, n_estimators=60 ....................................
[CV] ........ max_depth=2, n_estimators=60, score=0.308, total=   3.4s
[CV] max_depth=2, n_estimators=60 ....................................
[CV] ........ max_depth=2, n_estimators=60, score=0.301, total=   3.5s
[CV] max_depth=2, n_estimators=60 ....................................
[CV] ........ max_depth=2, n_estimators=60, score=0.315, total=   3.5s
[CV] max_depth=2, n_estimators=70 ....................................
[CV] ........ max_depth=2, n_estimators=70, score=0.321, total=   4.0s
[CV] max_depth=2, n_estimators=70 ....................................
[CV] .

[CV] ........ max_depth=3, n_estimators=90, score=0.338, total=   7.5s
[CV] max_depth=3, n_estimators=90 ....................................
[CV] ........ max_depth=3, n_estimators=90, score=0.329, total=   7.5s
[CV] max_depth=3, n_estimators=90 ....................................
[CV] ........ max_depth=3, n_estimators=90, score=0.368, total=   7.6s
[CV] max_depth=3, n_estimators=100 ...................................
[CV] ....... max_depth=3, n_estimators=100, score=0.349, total=   8.6s
[CV] max_depth=3, n_estimators=100 ...................................
[CV] ....... max_depth=3, n_estimators=100, score=0.368, total=   8.3s
[CV] max_depth=3, n_estimators=100 ...................................
[CV] ....... max_depth=3, n_estimators=100, score=0.339, total=   8.2s
[CV] max_depth=3, n_estimators=100 ...................................
[CV] ....... max_depth=3, n_estimators=100, score=0.331, total=   8.1s
[CV] max_depth=3, n_estimators=100 ...................................
[CV] .

[CV] ........ max_depth=5, n_estimators=50, score=0.367, total=   6.6s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ........ max_depth=5, n_estimators=50, score=0.368, total=   6.9s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ........ max_depth=5, n_estimators=50, score=0.327, total=   6.5s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ........ max_depth=5, n_estimators=50, score=0.329, total=   6.5s
[CV] max_depth=5, n_estimators=50 ....................................
[CV] ........ max_depth=5, n_estimators=50, score=0.359, total=   6.7s
[CV] max_depth=5, n_estimators=60 ....................................
[CV] ........ max_depth=5, n_estimators=60, score=0.372, total=   7.8s
[CV] max_depth=5, n_estimators=60 ....................................
[CV] ........ max_depth=5, n_estimators=60, score=0.374, total=   7.8s
[CV] max_depth=5, n_estimators=60 ....................................
[CV] .

[CV] ........ max_depth=6, n_estimators=80, score=0.358, total=  12.3s
[CV] max_depth=6, n_estimators=80 ....................................
[CV] ........ max_depth=6, n_estimators=80, score=0.387, total=  12.6s
[CV] max_depth=6, n_estimators=90 ....................................
[CV] ........ max_depth=6, n_estimators=90, score=0.381, total=  14.0s
[CV] max_depth=6, n_estimators=90 ....................................
[CV] ........ max_depth=6, n_estimators=90, score=0.381, total=  13.9s
[CV] max_depth=6, n_estimators=90 ....................................
[CV] ........ max_depth=6, n_estimators=90, score=0.349, total=  14.0s
[CV] max_depth=6, n_estimators=90 ....................................
[CV] ........ max_depth=6, n_estimators=90, score=0.373, total=  14.4s
[CV] max_depth=6, n_estimators=90 ....................................
[CV] ........ max_depth=6, n_estimators=90, score=0.387, total=  13.9s
[CV] max_depth=6, n_estimators=100 ...................................
[CV] .

[CV] ....... max_depth=7, n_estimators=200, score=0.416, total=  35.4s
[CV] max_depth=7, n_estimators=200 ...................................
[CV] ....... max_depth=7, n_estimators=200, score=0.357, total=  35.6s
[CV] max_depth=7, n_estimators=200 ...................................
[CV] ....... max_depth=7, n_estimators=200, score=0.396, total=  35.9s
[CV] max_depth=7, n_estimators=200 ...................................
[CV] ....... max_depth=7, n_estimators=200, score=0.410, total=  36.0s


[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 47.7min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_es...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=False,

In [19]:
print(model.best_score_)
print(model.best_params_)

0.39521781354717417
{'max_depth': 7, 'n_estimators': 200}


In [20]:
grid_pred = model.predict(X_test)

In [22]:
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': grid_pred})
to_submit.to_csv('solutions/grid_pred.csv', index=False)
to_submit.head()

Unnamed: 0,tweet_id,sentiment
0,7506,1
1,7992,1
2,247,2
3,7688,2
4,3294,2


In [None]:
xgb.fit(X_train, processed_train['sentiment'])
xgb_pred = xgb.predict(X_test)

In [13]:
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': xgb_pred})
to_submit.to_csv('weird_sols/xgb_pred.csv', index=False)
to_submit.head()

Unnamed: 0,tweet_id,sentiment
0,7506,1
1,7992,1
2,247,1
3,7688,2
4,3294,1


In [23]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb_pred = mnb.fit(X_train, processed_train['sentiment']).predict(X_test)
to_submit = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'sentiment': mnb_pred})
to_submit.to_csv('solutions/mnb_pred.csv', index=False)
to_submit.head()

Unnamed: 0,tweet_id,sentiment
0,7506,1
1,7992,1
2,247,1
3,7688,2
4,3294,2
