In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
import torch
from torch import nn
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [17]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\iyeng\AppData\Roaming\nltk_data...


True

In [196]:
from sklearn.decomposition import PCA
from textblob import TextBlob
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
clean_df = pd.read_csv("cleaned_subset.csv")
clean_df.dropna(inplace=True)
clean_df['target'] = clean_df['avg_score']/2
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2044 entries, 0 to 2044
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   clean_title    2044 non-null   object 
 1   clean_comment  2044 non-null   object 
 2   url            2044 non-null   object 
 3   avg_score      2044 non-null   float64
 4   target         2044 non-null   float64
dtypes: float64(2), object(3)
memory usage: 95.8+ KB


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(clean_df['clean_comment'], clean_df['target'], test_size = 0.2, random_state = 123)


## Custom models

In [285]:
right = """right wing, RW, authority, hierarchy, order, duty, tradition, reaction, nationalism, conservative, right-libertarian, \
neoconservative, imperialist, monarchist, fascist, reactionaries, traditionalist, traditional, death penalty, \
religion, Bhajpa, BJP, Shiv Sena, RSS, MNS, Sanatan, dharm, Hindutva, Islamophobia, Narendra, Modi, Amit, Shah, \
mandir, ram, valmiki, ramayan, Bharatiya, Janata, Democratic Alliance, NDA, AIADMK, Janta Dal, bhakt, CAA, NRC, hindu majority, \
hindu unity, hindu pride, nationalist, sangh, sanghi, yogi""".lower()
left = """left wing, LW, leftists, freedom, equality, fraternity, rights, progress, reform, internationalism, anarchist, communist, socialist, \
democratic socialist, social democrat, left-libertarian, progressive, social, liberal, Congress, UPA, \
INC, Aam, aadmi, AAP, CPI, CPI(M), Welfare, Protectionism, Commies, Rahul, gandhi, indira, yatra, arvind, kejriwal, \
libby, libbies, sjw, libtard, hinduphobia, LGBTQ, masjid, pappu, christian, muslim, secular, minority, minorities, Shashi, Tharoor""".lower()

In [286]:
right_terms = set(right.split(', '))
left_terms = set(left.split(', '))
len(right_terms), len(left_terms)

(53, 51)

In [18]:
vader = SentimentIntensityAnalyzer()

In [22]:
clean_df['clean_comment'][0]

'Extremely valid points but I believe he has the charisma to win over urban youth who is usually the most apathetic non voter. Although, I would keep my expectations low since Tharoor has a ton of intra party politics to deal with.'

In [25]:
vader.polarity_scores(clean_df['clean_comment'][0])['compound']

0.5941

In [306]:
def unsupervised_prediction(clean_text):
    sentiment = vader.polarity_scores(clean_text)['compound']
    lw = rw = 0
    clean_text = clean_text.lower()
    for t in left_terms:
        if re.search(r'\b'+t+r'\b', clean_text):
            lw += 1
    for t in right_terms:
        if re.search(r'\b'+t+r'\b', clean_text):
            rw += 1
    
    score = (rw-lw)*sentiment
    # normalize by length of text
    score /= len(clean_text.split())/20
    
    return score

In [307]:
pred_df = clean_df.copy()
pred_df['vader_pred'] = pred_df.apply(lambda x: unsupervised_prediction(x['clean_comment']), axis=1)
pred_df.head()

Unnamed: 0,clean_title,clean_comment,url,avg_score,target,vader_pred
0,Shashi Tharoor Set To Run For Congress Preside...,Extremely valid points but I believe he has th...,/r/india/comments/xif8wm/shashi_tharoor_set_to...,-1.0,-0.5,-0.282905
1,Yogi government puts Kolkata's image as part o...,Even if the ad was designed by the newspaper t...,/r/india/comments/pmn9o3/yogi_government_puts_...,-1.0,-0.5,-0.0
2,"An attempt to address the list of ""simplified""...",Diverse population including Muslims. Welcomin...,/r/india/comments/ebdeup/an_attempt_to_address...,-1.5,-0.75,-0.25437
3,You guys noticing what's happening in Sri Lanka?,What a joke. They didn't create any propaganda...,/r/india/comments/tt1ryh/you_guys_noticing_wha...,0.0,0.0,-0.0
4,Just got abused on the train by a hyper nation...,Ohoo bahut bura laga ye sunke ki aap Undergarm...,/r/india/comments/rh2kcs/just_got_abused_on_th...,-2.0,-1.0,-0.257458


In [72]:
def conv_to_label(val, threshold=0.25):
    if abs(val)<threshold:
        return 0.
    else:
        return np.sign(val)

In [308]:
pred_df['y_true'] = pred_df.apply(lambda x: np.sign(x['target']), axis=1)
pred_df['y_pred'] = pred_df.apply(lambda x: conv_to_label(x['vader_pred'], threshold=0.2), axis=1)
pred_df.head()

Unnamed: 0,clean_title,clean_comment,url,avg_score,target,vader_pred,y_true,y_pred
0,Shashi Tharoor Set To Run For Congress Preside...,Extremely valid points but I believe he has th...,/r/india/comments/xif8wm/shashi_tharoor_set_to...,-1.0,-0.5,-0.282905,-1.0,-1.0
1,Yogi government puts Kolkata's image as part o...,Even if the ad was designed by the newspaper t...,/r/india/comments/pmn9o3/yogi_government_puts_...,-1.0,-0.5,-0.0,-1.0,0.0
2,"An attempt to address the list of ""simplified""...",Diverse population including Muslims. Welcomin...,/r/india/comments/ebdeup/an_attempt_to_address...,-1.5,-0.75,-0.25437,-1.0,-1.0
3,You guys noticing what's happening in Sri Lanka?,What a joke. They didn't create any propaganda...,/r/india/comments/tt1ryh/you_guys_noticing_wha...,0.0,0.0,-0.0,0.0,0.0
4,Just got abused on the train by a hyper nation...,Ohoo bahut bura laga ye sunke ki aap Undergarm...,/r/india/comments/rh2kcs/just_got_abused_on_th...,-2.0,-1.0,-0.257458,-1.0,-1.0


In [309]:
np.mean(pred_df['y_true']==pred_df['y_pred'])  # 3-class accuracy

0.450587084148728

In [267]:
def feature_extraction(clean_text):
    clean_text = clean_text.lower()
    features = []
    for t in left_terms:
        features.append(len(re.findall(r'\b'+t+r'\b', clean_text)))
    for t in right_terms:
        features.append(len(re.findall(r'\b'+t+r'\b', clean_text)))
    
    return features

In [268]:
feature_extraction(clean_df['clean_comment'][0])

[0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [189]:
def add_sentiment_features(clean_text, extracted_feat):
    temp = extracted_feat.copy()
    temp.append(vader.polarity_scores(clean_text)['compound'])
    tb = TextBlob(clean_text)
    temp.append(tb.sentiment.polarity)
    temp.append(tb.sentiment.subjectivity)
    return temp

In [120]:
feature_df = clean_df.copy()
feature_df['extracted_features'] = feature_df.apply(lambda x: feature_extraction(x['clean_comment']), axis=1)
feature_df['final_features'] = feature_df.apply(lambda x: add_sentiment_features(x['clean_comment'], x['extracted_features']), axis=1)
feature_df

Unnamed: 0,clean_title,clean_comment,url,avg_score,target,extracted_features,final_features
0,Shashi Tharoor Set To Run For Congress Preside...,Extremely valid points but I believe he has th...,/r/india/comments/xif8wm/shashi_tharoor_set_to...,-1.0,-0.50,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Yogi government puts Kolkata's image as part o...,Even if the ad was designed by the newspaper t...,/r/india/comments/pmn9o3/yogi_government_puts_...,-1.0,-0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"An attempt to address the list of ""simplified""...",Diverse population including Muslims. Welcomin...,/r/india/comments/ebdeup/an_attempt_to_address...,-1.5,-0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,You guys noticing what's happening in Sri Lanka?,What a joke. They didn't create any propaganda...,/r/india/comments/tt1ryh/you_guys_noticing_wha...,0.0,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Just got abused on the train by a hyper nation...,Ohoo bahut bura laga ye sunke ki aap Undergarm...,/r/india/comments/rh2kcs/just_got_abused_on_th...,-2.0,-1.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...
2040,"An attempt to address the list of ""simplified""...",They dont have to prove persecution. It will b...,/r/india/comments/ebdeup/an_attempt_to_address...,0.0,0.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2041,ResignModi trending in India on 1 with 200k tw...,Both parties are busy in just fighting and win...,/r/india/comments/n11sqc/resignmodi_trending_i...,1.0,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2042,Suresh Chavanke Not a Dalit Sikh but a Christi...,My main point is that separatist movement isn'...,/r/IndiaSpeaks/comments/prtiwo/suresh_chavanke...,-1.0,-0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2043,Bahut jagah hai arey nahin jagah hai.. ft. CON...,when the opposition become so incompetent...Mo...,/r/IndiaSpeaks/comments/w8ib4s/bahut_jagah_hai...,1.5,0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [121]:
X_train, X_test, y_train, y_test = train_test_split(list(feature_df['final_features']), list(feature_df['target']), test_size = 0.2, random_state = 123)

In [122]:
xgb = xg.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.05)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.41188267362749953

In [269]:
pca = PCA(n_components=8)
feature_df = clean_df.copy()
feature_df['extracted_features'] = feature_df.apply(lambda x: feature_extraction(x['clean_comment']), axis=1)
feature_df['pca_features'] = pca.fit_transform(list(feature_df['extracted_features'])).tolist()
feature_df.head()

Unnamed: 0,clean_title,clean_comment,url,avg_score,target,extracted_features,pca_features
0,Shashi Tharoor Set To Run For Congress Preside...,Extremely valid points but I believe he has th...,/r/india/comments/xif8wm/shashi_tharoor_set_to...,-1.0,-0.5,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.23297564254883493, -0.13876301901895569, 0..."
1,Yogi government puts Kolkata's image as part o...,Even if the ad was designed by the newspaper t...,/r/india/comments/pmn9o3/yogi_government_puts_...,-1.0,-0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.23515893815269628, -0.13195145112349999, -..."
2,"An attempt to address the list of ""simplified""...",Diverse population including Muslims. Welcomin...,/r/india/comments/ebdeup/an_attempt_to_address...,-1.5,-0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.20015210423580698, -0.05676285540707755, -..."
3,You guys noticing what's happening in Sri Lanka?,What a joke. They didn't create any propaganda...,/r/india/comments/tt1ryh/you_guys_noticing_wha...,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.23515893815269623, -0.13195145112349838, -..."
4,Just got abused on the train by a hyper nation...,Ohoo bahut bura laga ye sunke ki aap Undergarm...,/r/india/comments/rh2kcs/just_got_abused_on_th...,-2.0,-1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.15069254877229663, -0.13615278613879428, -..."


In [270]:
feature_df['final_features'] = feature_df.apply(lambda x: add_sentiment_features(x['clean_comment'], x['pca_features']), axis=1)
feature_df.head()

Unnamed: 0,clean_title,clean_comment,url,avg_score,target,extracted_features,pca_features,final_features
0,Shashi Tharoor Set To Run For Congress Preside...,Extremely valid points but I believe he has th...,/r/india/comments/xif8wm/shashi_tharoor_set_to...,-1.0,-0.5,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.23297564254883493, -0.13876301901895569, 0...","[-0.23297564254883493, -0.13876301901895569, 0..."
1,Yogi government puts Kolkata's image as part o...,Even if the ad was designed by the newspaper t...,/r/india/comments/pmn9o3/yogi_government_puts_...,-1.0,-0.5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.23515893815269628, -0.13195145112349999, -...","[-0.23515893815269628, -0.13195145112349999, -..."
2,"An attempt to address the list of ""simplified""...",Diverse population including Muslims. Welcomin...,/r/india/comments/ebdeup/an_attempt_to_address...,-1.5,-0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.20015210423580698, -0.05676285540707755, -...","[-0.20015210423580698, -0.05676285540707755, -..."
3,You guys noticing what's happening in Sri Lanka?,What a joke. They didn't create any propaganda...,/r/india/comments/tt1ryh/you_guys_noticing_wha...,0.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.23515893815269623, -0.13195145112349838, -...","[-0.23515893815269623, -0.13195145112349838, -..."
4,Just got abused on the train by a hyper nation...,Ohoo bahut bura laga ye sunke ki aap Undergarm...,/r/india/comments/rh2kcs/just_got_abused_on_th...,-2.0,-1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.15069254877229663, -0.13615278613879428, -...","[-0.15069254877229663, -0.13615278613879428, -..."


In [153]:
def convert_to_classifier(targets, preds, threshold=1/3):
    new_targets = [np.sign(x) for x in targets]
    new_preds = [np.sign(x) if abs(x)>threshold else 0. for x in preds]
    return new_targets, new_preds

In [192]:
X_train, X_test, y_train, y_test = train_test_split(list(feature_df['final_features']), list(feature_df['target']), test_size = 0.2, random_state = 123)
xgb = xg.XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.05)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.4196762246523906

In [193]:
new_y_test, new_y_pred = convert_to_classifier(y_test, y_pred, threshold=0.08)
np.mean([int(new_y_test[i]==new_y_pred[i]) for i in range(len(new_y_test))])

0.5036674816625917

In [311]:
X_train, X_test, y_train, y_test = train_test_split(list(feature_df['final_features']), [np.sign(x)+1 for x in list(feature_df['target'])], test_size = 0.2, random_state = 123)
xgb = xg.XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.005)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.mean([int(y_test[i]==y_pred[i]) for i in range(len(y_test))])

0.508557457212714

In [281]:
confusion_matrix(y_test, y_pred)

array([[ 58,  72,  10],
       [ 31, 134,   7],
       [ 27,  54,  16]], dtype=int64)

In [282]:
print(classification_report(y_test, y_pred, target_names=['Left', 'Neutral', 'Right']))

              precision    recall  f1-score   support

        Left       0.50      0.41      0.45       140
     Neutral       0.52      0.78      0.62       172
       Right       0.48      0.16      0.25        97

    accuracy                           0.51       409
   macro avg       0.50      0.45      0.44       409
weighted avg       0.50      0.51      0.47       409



##

## Binary classification

In [216]:
pca = PCA(n_components=8)
feature_df = clean_df[clean_df['target']!=0].copy()
feature_df.reset_index(inplace=True, drop=True)
feature_df['extracted_features'] = feature_df.apply(lambda x: feature_extraction(x['clean_comment']), axis=1)
feature_df['pca_features'] = pca.fit_transform(list(feature_df['extracted_features'])).tolist()
feature_df['final_features'] = feature_df.apply(lambda x: add_sentiment_features(x['clean_comment'], x['pca_features']), axis=1)
feature_df

Unnamed: 0,clean_title,clean_comment,url,avg_score,target,extracted_features,pca_features,final_features
0,Shashi Tharoor Set To Run For Congress Preside...,Extremely valid points but I believe he has th...,/r/india/comments/xif8wm/shashi_tharoor_set_to...,-1.0,-0.50,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.3234842281799675, -0.1535394331148085, -0....","[-0.3234842281799675, -0.1535394331148085, -0...."
1,Yogi government puts Kolkata's image as part o...,Even if the ad was designed by the newspaper t...,/r/india/comments/pmn9o3/yogi_government_puts_...,-1.0,-0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.3209315480018639, -0.14966917135297017, -0...","[-0.3209315480018639, -0.14966917135297017, -0..."
2,"An attempt to address the list of ""simplified""...",Diverse population including Muslims. Welcomin...,/r/india/comments/ebdeup/an_attempt_to_address...,-1.5,-0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.3046462399679741, -0.07725001302534959, 0....","[-0.3046462399679741, -0.07725001302534959, 0...."
3,Just got abused on the train by a hyper nation...,Ohoo bahut bura laga ye sunke ki aap Undergarm...,/r/india/comments/rh2kcs/just_got_abused_on_th...,-2.0,-1.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.24153951013944386, -0.1863100882540193, -0...","[-0.24153951013944386, -0.1863100882540193, -0..."
4,"Defeat BJP Mission Uttar Pradesh,' Farmers to ...",Not really. As the country has seen for the la...,/r/india/comments/nmyqu4/defeat_bjp_mission_ut...,-2.0,-1.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.5873634681978407, -0.3885404504189382, 0.24...","[0.5873634681978407, -0.3885404504189382, 0.24..."
...,...,...,...,...,...,...,...,...
1188,BJP Gujarat 2022 Manifesto.,"this is why bjp wins and would keep winning, r...",/r/IndiaSpeaks/comments/z5u6kf/bjp_gujarat_202...,2.0,1.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.9493788158936218, 0.0938133025320356, -0.30...","[0.9493788158936218, 0.0938133025320356, -0.30..."
1189,Kejriwal says The Kashmir Files is a jhoothi f...,Mujhe ghanta fark nahi padta. Ghabrana tujhe h...,/r/IndiaSpeaks/comments/tmhqqg/kejriwal_says_t...,1.5,0.75,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[-0.3209315480018639, -0.14966917135296984, -0...","[-0.3209315480018639, -0.14966917135296984, -0..."
1190,ResignModi trending in India on 1 with 200k tw...,Both parties are busy in just fighting and win...,/r/india/comments/n11sqc/resignmodi_trending_i...,1.0,0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.6101148015384652, -0.42920286864883467, 0.2...","[0.6101148015384652, -0.42920286864883467, 0.2..."
1191,Suresh Chavanke Not a Dalit Sikh but a Christi...,My main point is that separatist movement isn'...,/r/IndiaSpeaks/comments/prtiwo/suresh_chavanke...,-1.0,-0.50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[-0.36731868940660556, 0.44877057569072865, 0....","[-0.36731868940660556, 0.44877057569072865, 0...."


In [317]:
X_train, X_test, y_train, y_test = train_test_split(list(feature_df['final_features']), [1 if x>0 else 0 for x in list(feature_df['target'])], test_size = 0.2, random_state = 123)
xgb = xg.XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.mean([int(y_test[i]==y_pred[i]) for i in range(len(y_test))])

0.7408312958435208

In [248]:
confusion_matrix(y_test, y_pred)

array([[87, 35],
       [70, 47]], dtype=int64)

In [249]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.71      0.62       122
           1       0.57      0.40      0.47       117

    accuracy                           0.56       239
   macro avg       0.56      0.56      0.55       239
weighted avg       0.56      0.56      0.55       239

