In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
df_test = pd.read_csv('test.csv')

In [4]:
df_train = df_train.drop('id',axis = 1)

In [5]:
df_test = df_test.drop('id',axis = 1)

In [6]:
df_train = df_train.drop('location',axis = 1)

In [7]:
df_test = df_test.drop('location',axis = 1)

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   keyword  7552 non-null   object
 1   text     7613 non-null   object
 2   target   7613 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 178.6+ KB


In [21]:
df_test.isna().sum()

keyword    26
text        0
dtype: int64

In [18]:
df_train['keyword'].value_counts()

keyword
fatalities               45
deluge                   42
armageddon               42
damage                   41
body%20bags              41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: count, Length: 221, dtype: int64

In [19]:
keyword_dist = df_train['keyword'].value_counts(normalize=True)

nan_count = df_train['keyword'].isna().sum()
random_keywords = np.random.choice(keyword_dist.index,size=nan_count,p=keyword_dist.values )

df_train.loc[df_train['keyword'].isna(), 'keyword'] = random_keywords

In [22]:
keyword_dist = df_test['keyword'].value_counts(normalize=True)

nan_count = df_test['keyword'].isna().sum()
random_keywords = np.random.choice(keyword_dist.index,size=nan_count,p=keyword_dist.values )

df_test.loc[df_test['keyword'].isna(), 'keyword'] = random_keywords

In [28]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text) 
    text = re.sub(r"[^a-z\s]", "", text)   
    text = re.sub(r"\s+", " ", text).strip() 
    return text

df_train['clean_text'] = df_train['text'].apply(clean_text)

In [31]:
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
cv = CountVectorizer(max_features=5000, ngram_range=(1,2))

In [33]:
X_count = cv.fit_transform(df_train['clean_text'])

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

In [38]:
X_tfidf = tfidf.fit_transform(df_train['clean_text'])

In [40]:
from sklearn.model_selection import train_test_split

In [43]:
y = df_train['target']

In [97]:
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=35)

# RandomForestClassifier + Tfidf

In [41]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)


In [55]:
model.fit(X_train,y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,20
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [56]:
from sklearn.metrics import classification_report, accuracy_score

In [57]:
y_pred = model.predict(X_val)

In [58]:
print("Accuracy:", accuracy_score(y_val, y_pred))

Accuracy: 0.7550886408404465


In [59]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.87      0.80       874
           1       0.78      0.60      0.68       649

    accuracy                           0.76      1523
   macro avg       0.76      0.73      0.74      1523
weighted avg       0.76      0.76      0.75      1523



# GridSearchCV

In [60]:
from sklearn.model_selection import GridSearchCV

In [61]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [15, 20, 25],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt'],
    'class_weight': ['balanced']
}

In [62]:
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    verbose=2,
    n_jobs=-1
)

In [63]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'class_weight': ['balanced'], 'max_depth': [15, 20, ...], 'max_features': ['sqrt'], 'min_samples_leaf': [1, 2], ...}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,25
,min_samples_split,5
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [64]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.87      0.80       874
           1       0.78      0.60      0.68       649

    accuracy                           0.76      1523
   macro avg       0.76      0.73      0.74      1523
weighted avg       0.76      0.76      0.75      1523



# XGboost

In [98]:
from xgboost import XGBClassifier

In [99]:
xgb_model = XGBClassifier(
    n_estimators=300,      
    learning_rate=0.05,   
    max_depth=5,                  
    min_child_weight=2,      
    subsample=0.85,             
    colsample_bytree=0.85,      
    gamma=1,     
    reg_alpha=0.5,           
    reg_lambda=1.0,         
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1                 
)

In [100]:
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.85
,device,
,early_stopping_rounds,
,enable_categorical,False


In [101]:
y_pred_xgb = xgb_model.predict(X_val)

In [102]:
print(classification_report(y_val, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.75      0.92      0.82       861
           1       0.85      0.60      0.70       662

    accuracy                           0.78      1523
   macro avg       0.80      0.76      0.76      1523
weighted avg       0.79      0.78      0.77      1523



In [103]:
X_test = tfidf.transform(df_test['text'])
y_test_pred = xgb_model.predict(X_test)

In [104]:
submission = pd.read_csv('sample_submission.csv')

In [105]:
submission['target'] = y_test_pred

In [106]:
submission.to_csv('sub2.csv',index = False, index_label = False)

# LogisticRegression

In [108]:
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y, test_size=0.2, random_state=35)

In [110]:
from sklearn.linear_model import LogisticRegression

In [112]:
logistic_model = LogisticRegression(
    C=1.0,
    class_weight='balanced',
    max_iter=1000,
    n_jobs=-1,
    random_state=42
)

In [113]:
logistic_model.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000


In [117]:
logistic_y_pred = logistic_model.predict(X_val)

In [116]:
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       861
           1       0.77      0.74      0.76       662

    accuracy                           0.79      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.79      0.79      0.79      1523



In [120]:
submission['target'] = logistic_model.predict(X_test)

In [121]:
submission.to_csv("sub3.csv", index=False,index_label = False)
