In [1]:
import numpy as np
import pandas as pd
# Load the labelled training set and the unlabelled test set from the working directory.
train, test = (pd.read_csv(csv_name, sep=',') for csv_name in ('train.csv', 'test.csv'))
# Preview the first training rows to confirm the columns parsed as expected.
train.head()

Unnamed: 0,text,label
0,Taano ta ako pa?,0
1,Pag - isipan an halimbawa ni Twerter na sarong...,1
2,□ Taano ta kaipuhan pa nin dakol na tawo an so...,1
3,"An mga Pag - eksamin sa Paayadya, Asin an Pag ...",1
4,Si Andrew Cybbbber an nasa itaas na parte kan ...,1


In [2]:
# Class balance check: number of training rows per label value.
train.loc[:, 'label'].value_counts()

label
0    75817
1    74183
Name: count, dtype: int64

In [3]:
# Preview the test set (5 rows is head()'s default, spelled out here).
test.head(n=5)

Unnamed: 0,id,text
0,0,"Si Lolol, na arog kaini an ngaran, malinaw na ..."
1,1,Mga Pag - oswag sa kumusta Man?
2,2,"E.U.A., helingon an The Watchtower, Awake!"
3,3,An paggamit sana nin 70 letra sa ika - 70 letr...
4,4,Magtanaw sa Yanrans's ngonyan - na naglalaom n...


In [4]:
# Model inputs: raw text as strings for both splits, integer target for training.
X, X_test = (frame['text'].astype(str) for frame in (train, test))
y = train['label'].astype(int)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [6]:
# Character-level TF-IDF: 1- to 4-character n-grams taken within word boundaries,
# dropping n-grams seen in fewer than 8 documents and capping the vocabulary size.
tfidf = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(1, 4),
    min_df=8,
    max_features=116700,
)

In [7]:
# Learn the vocabulary and IDF weights from train and test text together (no labels
# are involved), then project each split into that shared feature space.
tfidf.fit(pd.concat([X, X_test]))
X_tfidf, X_test_tfidf = (tfidf.transform(texts) for texts in (X, X_test))

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
def objective1(trial):
    """Optuna objective for logistic regression on the TF-IDF features.

    Samples penalty, solver, C and class_weight from ``trial``, then returns
    the mean ROC-AUC of a 5-fold cross-validation on the module-level
    ``X_tfidf`` / ``y``.
    """
    penalty = trial.suggest_categorical('penalty', ['l2', 'l1'])
    # For l1 the solver is fixed to 'saga'; it is only a search dimension for l2.
    if penalty == 'l1':
        solver = 'saga'
    else:
        solver = trial.suggest_categorical('solver', ['lbfgs', 'saga'])
    C = trial.suggest_float('C', 1e-4, 100, log=True)
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    model = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        class_weight=class_weight,
        random_state=42,
    )
    fold_aucs = cross_val_score(model, X_tfidf, y, cv=5, scoring='roc_auc', n_jobs=-1)
    return np.mean(fold_aucs)


# Seeded TPE sampler so the sequence of suggested hyper-parameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective1, n_trials=100, show_progress_bar=True)

# Report the winning configuration (labels read "Best parameters" / "Best ROC-AUC").
print("Лучшие параметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)

In [9]:
# Final logistic-regression submission: fit on every training row, score the test set.
# random_state is pinned because the liblinear solver shuffles the data internally;
# without it repeated runs are not guaranteed to reproduce the same coefficients.
logreg = LogisticRegression(
    penalty='l1',
    C=1.3869238794158671,
    class_weight='balanced',
    solver='liblinear',
    random_state=42,
)
logreg.fit(X_tfidf, y)
y_pred = logreg.predict_proba(X_test_tfidf)

# Column 1 of predict_proba is the probability of the second class (label 1 here).
g = pd.DataFrame({'id': test['id'], 'sentiment': y_pred[:, 1]})
g.to_csv('subm3.csv', sep=',', index=False)

In [None]:
from sklearn.naive_bayes import MultinomialNB
def objective2(trial):
    """Optuna objective for multinomial naive Bayes on the TF-IDF features.

    Samples the smoothing strength ``alpha`` (log scale) and ``fit_prior``
    from ``trial`` and returns the mean ROC-AUC of a 5-fold cross-validation
    on the module-level ``X_tfidf`` / ``y``.
    """
    nb_params = {
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'fit_prior': trial.suggest_categorical('fit_prior', [True, False]),
    }
    model = MultinomialNB(**nb_params)
    fold_aucs = cross_val_score(model, X_tfidf, y, cv=5, scoring='roc_auc', n_jobs=-1)
    return np.mean(fold_aucs)


# Seeded TPE search over the naive Bayes hyper-parameters.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective2, n_trials=470, show_progress_bar=True)

# Report the winning configuration (labels read "Best parameters" / "Best ROC-AUC").
print("Лучшие параметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)
# Recorded result of a previous run:
#   Best parameters: {'alpha': 3.159856116317881, 'fit_prior': False}
#   Best ROC-AUC: 0.7394042045339771


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random-forest baseline, scored with the same 5-fold ROC-AUC protocol as the other models.
# random_state is pinned so the bootstrap samples (and hence the scores) are repeatable.
rfc = RandomForestClassifier(
    n_estimators=967,
    max_depth=77,
    min_samples_split=6,
    min_samples_leaf=5,
    max_features=None,
    random_state=42,
)
# Score the forest defined above. The previous code passed `model`, a name that only
# exists as a local inside the Optuna objectives, so `rfc` itself was never evaluated.
print(cross_val_score(rfc, X_tfidf, y, cv=5, scoring='roc_auc', n_jobs=-1))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
def objective4(trial):
    """Optuna objective for gradient boosting on the TF-IDF features.

    Samples the tree-ensemble hyper-parameters from ``trial`` and returns the
    mean ROC-AUC of a 5-fold cross-validation on the module-level
    ``X_tfidf`` / ``y``. Folds are evaluated sequentially (``n_jobs=1``).
    """
    # Dict entries are evaluated top to bottom, so the suggestion order is fixed.
    gb_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    }

    model = GradientBoostingClassifier(random_state=42, **gb_params)
    fold_aucs = cross_val_score(model, X_tfidf, y, cv=5, scoring='roc_auc', n_jobs=1)
    return np.mean(fold_aucs)


# Seeded TPE search over the gradient-boosting hyper-parameters.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective4, n_trials=100, show_progress_bar=True)

# Report the winning configuration (labels read "Best parameters" / "Best ROC-AUC").
print("Лучшие параметры:", study.best_params)
print("Лучший ROC-AUC:", study.best_value)
# Recorded log line from a previous run:
#   Trial 0 finished with value: 0.735127175768006 and parameters:
#   {'n_estimators': 218, 'learning_rate': 0.2536999076681772, 'max_depth': 8,
#    'min_samples_split': 13, 'min_samples_leaf': 2, 'subsample': 0.662397808134481,
#    'max_features': 'log2'}. Best is trial 0 with value: 0.735127175768006.

In [None]:
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Kernel SVM with sigmoid (Platt) calibration, evaluated by shuffled stratified 5-fold CV.
# SVC is already a scikit-learn estimator, so it is passed to the calibrator directly;
# the former `SklearnClassifier(...)` wrapper was never imported and raised NameError.
# probability=True is omitted: CalibratedClassifierCV calibrates on decision_function
# output, so SVC's own internal probability fitting would only add training cost.
clf = SVC(C=1.0, tol=1e-4, max_iter=20000, class_weight='balanced', random_state=42)
cal = CalibratedClassifierCV(clf, method='sigmoid', cv=5)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(cal, X_tfidf, y, cv=cv, scoring='roc_auc', n_jobs=-1)
print('CV AUC mean:', np.mean(scores))

In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from scipy.sparse import hstack
import lightgbm as lgb

# Single seed shared by NumPy's global RNG and every estimator in this cell.
RND = 42
np.random.seed(RND)

# Re-read the raw CSVs so this cell does not depend on frames built by earlier cells.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

def norm(s):
    """Return ``s`` as a string with whitespace normalised.

    Missing values (anything ``pd.isna`` reports as NA) become ``''``.
    Otherwise the value is converted with ``str``, every run of whitespace
    (including newlines and carriage returns) is collapsed to a single space,
    and leading/trailing whitespace is removed.
    """
    import re  # kept local: the notebook has no module-level `re` import

    if pd.isna(s):
        return ''
    # `\s` already matches '\n' and '\r', so one substitution covers them too.
    collapsed = re.sub(r'\s+', ' ', str(s))
    return collapsed.strip()
# Normalise whitespace in both splits. norm() is applied to the raw column: casting
# with astype(str) first can turn a missing value into the literal text 'nan', which
# would bypass norm's pd.isna guard and end up in the vocabulary as a real token.
# norm() itself converts non-missing values with str().
train['text'] = train['text'].map(norm)
test['text'] = test['text'].map(norm)


# Vocabulary caps for the two TF-IDF views.
word_max = 40000
char_max = 10000

# Word view: lower-cased uni- to tri-grams. Character view: 3- to 5-character n-grams.
word_vect = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_features=word_max,
    lowercase=True,
)
char_vect = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    max_features=char_max,
)

print('Fitting TF-IDF on train...')
train_texts = train['text'].tolist()
Xw = word_vect.fit_transform(train_texts)
Xc = char_vect.fit_transform(train_texts)
# Word and character features side by side; CSR so rows can be sliced per fold.
X_train = hstack([Xw, Xc]).tocsr()
y = train['label'].values

print('Transforming test...')
test_texts = test['text'].tolist()
Xw_test = word_vect.transform(test_texts)
Xc_test = char_vect.transform(test_texts)
X_test = hstack([Xw_test, Xc_test]).tocsr()

# Shuffled, stratified folds shared by all three base models.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RND)

# Per-model buffers: out-of-fold predictions (one slot per training row) and
# fold-averaged test predictions (one slot per test row), all starting at zero.
n_train_rows, n_test_rows = len(train), len(test)
oof_lr, oof_lgb, oof_sgd = (np.zeros(n_train_rows) for _ in range(3))
test_lr, test_lgb, test_sgd = (np.zeros(n_test_rows) for _ in range(3))

# Train the three base models on each fold. Every model writes its predictions for the
# held-out rows into its oof_* buffer and adds 1/n_splits of its test prediction to its
# test_* buffer, so after the loop test_* holds the average over folds.
for fold, (tr, vl) in enumerate(skf.split(np.zeros(len(y)), y), 1):
    print('Fold', fold)
    X_tr, X_val = X_train[tr], X_train[vl]
    y_tr, y_val = y[tr], y[vl]

    # 1) LogisticRegression (with best params)
    lr = LogisticRegression(
        penalty='l1',
        C=1.3869238794158671,
        class_weight='balanced',
        solver='liblinear',
        max_iter=1000,
        random_state=RND,
    )
    lr.fit(X_tr, y_tr)
    oof_lr[vl] = lr.predict_proba(X_val)[:, 1]
    test_lr += lr.predict_proba(X_test)[:, 1] / n_splits

    # 2) LightGBM (fast)
    # NOTE(review): the validation fold drives early stopping and is also the fold that
    # is scored out-of-fold, so the LGB OOF AUC may be slightly optimistic.
    lgbm = lgb.LGBMClassifier(
        objective='binary',
        learning_rate=0.05,
        n_estimators=1000,
        num_leaves=31,
        n_jobs=8,
        random_state=RND,
    )
    lgbm.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(80)],
    )
    oof_lgb[vl] = lgbm.predict_proba(X_val)[:, 1]
    test_lgb += lgbm.predict_proba(X_test)[:, 1] / n_splits

    # 3) SGDClassifier
    sgd = SGDClassifier(loss='log_loss', penalty='l2', max_iter=2000, tol=1e-4, random_state=RND)
    sgd.fit(X_tr, y_tr)
    try:
        probs_val = sgd.predict_proba(X_val)[:, 1]
        probs_test = sgd.predict_proba(X_test)[:, 1]
    except AttributeError:
        # Only reached if the configured loss does not expose predict_proba.
        # Any other error (including KeyboardInterrupt) now propagates instead of
        # being silently replaced by rescaled scores.
        # Fallback: min-max scale the decision scores using the validation fold's
        # range; test values may therefore fall slightly outside [0, 1].
        df_val = sgd.decision_function(X_val)
        df_test = sgd.decision_function(X_test)
        df_min, df_max = df_val.min(), df_val.max()
        if df_max - df_min == 0:
            # Constant scores carry no ranking information; use a neutral 0.5.
            probs_val = 0.5 + 0 * df_val
            probs_test = 0.5 + 0 * df_test
        else:
            probs_val = (df_val - df_min) / (df_max - df_min)
            probs_test = (df_test - df_min) / (df_max - df_min)
    oof_sgd[vl] = probs_val
    test_sgd += probs_test / n_splits

# Out-of-fold ROC-AUC of each base model over the full training set.
print('LR OOF AUC:', roc_auc_score(y, oof_lr))
print('LGB OOF AUC:', roc_auc_score(y, oof_lgb))
print('SGD OOF AUC:', roc_auc_score(y, oof_sgd))

# Stack/meta model: one column per base model, rows aligned with train / test.
base_oof = [oof_lr, oof_lgb, oof_sgd]
base_test = [test_lr, test_lgb, test_sgd]
X_oof_stack = np.vstack(base_oof).T
X_test_stack = np.vstack(base_test).T

# The meta-learner is fitted on out-of-fold predictions and applied to the
# fold-averaged test predictions.
meta = LogisticRegression(C=1.0, penalty='l2', max_iter=1000, random_state=RND)
meta.fit(X_oof_stack, y)
final_test = meta.predict_proba(X_test_stack)[:, 1]

# Persist the OOF predictions and the submission file under ./output.
outdir = Path('output')
outdir.mkdir(exist_ok=True)

oof_frame = pd.DataFrame({
    'label': train['label'],
    'oof_lr': oof_lr,
    'oof_lgb': oof_lgb,
    'oof_sgd': oof_sgd,
})
oof_frame.to_csv(outdir/'oof_quick.csv', index=False)

submission_frame = pd.DataFrame({'id': test['id'], 'sentiment': final_test})
submission_frame.to_csv(outdir/'submission_quick_ensemble.csv', index=False)
print('Saved submission to', outdir/'submission_quick_ensemble.csv')
