In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import json
import xgboost as xg
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from textblob import TextBlob
import torch
from torch import nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


device(type='cpu')

In [2]:
# start here
train_df = pd.read_csv('train_full.csv')
eval_df = pd.read_csv('eval_full.csv')

In [3]:
train_df['true_label'] = train_df.apply(lambda x: np.sign(x['target'])+1, axis=1)
train_df.head()

Unnamed: 0,clean_title,clean_comment,comment_id,url,target,true_label
0,BJP Gujarat 2022 Manifesto.,There are two types of parties those that prom...,iy20u35,/r/IndiaSpeaks/comments/z5u6kf/bjp_gujarat_202...,0.0,1.0
1,Rajasthan BJP MLA Sanjay Sharma posted this on...,"If fundamentals are sound then, faith can help...",hnc96s6,/r/IndiaSpeaks/comments/r9df3s/rajasthan_bjp_m...,0.75,2.0
2,The new Indian If Trump's supporters blame Ant...,"Contrary to popular belief, until Modi, India ...",giqnuc2,/r/india/comments/ku5i9f/the_new_indian_if_tru...,-0.5,0.0
3,Kejriwal says that Mukesh Ambani's Antilia is ...,Bro inhone bahut land hijack kiya hai kisse pa...,iq3l8o5,/r/IndiaSpeaks/comments/xpb83d/kejriwal_says_t...,-0.25,0.0
4,The picture pretty much sums up the ideology o...,The 1.9 million denied citizenship in Assam al...,fbtqycg,/r/india/comments/ee11sp/the_picture_pretty_mu...,-1.0,0.0


In [4]:
eval_df['true_label'] = eval_df.apply(lambda x: np.sign(x['target'])+1, axis=1)
eval_df.head()

Unnamed: 0,clean_title,clean_comment,comment_id,url,target,true_label
0,ResignModi trending in India on 1 with 200k tw...,"People tend to look down on ""social media acti...",gwe056n,/r/india/comments/n11sqc/resignmodi_trending_i...,-0.5,0.0
1,"Put 'The Kashmir Files' on YouTube, everyone w...","I remember it was made to release twice, and t...",i1y3pv3,/r/india/comments/tmcun4/put_the_kashmir_files...,0.0,1.0
2,Kejriwal says 'The Kashmir Files' is a jhoothi...,Bhai delhi m hindu bht h but ye aur caste k lo...,i21haf8,/r/IndiaSpeaks/comments/tmhqqg/kejriwal_says_t...,0.5,2.0
3,Government of India has Blacklisted Karl Rock ...,mahatma gandhi's shadows are fading. retards w...,h4lazw7,/r/india/comments/ogrc0d/government_of_india_h...,-1.0,0.0
4,Kejriwal says 'The Kashmir Files' is a jhoothi...,Isn't this kejru the same guy who kept tweetin...,i1yrkwx,/r/IndiaSpeaks/comments/tmhqqg/kejriwal_says_t...,0.5,2.0


In [5]:
right = """right wing, RW, authority, hierarchy, order, duty, tradition, reaction, nationalism, conservative, right-libertarian, \
neoconservative, imperialist, monarchist, fascist, reactionaries, traditionalist, traditional, death penalty, \
religion, Bhajpa, BJP, Shiv Sena, RSS, MNS, Sanatan, dharm, Hindutva, Islamophobia, Narendra, Modi, Amit, Shah, \
mandir, ram, valmiki, ramayan, Bharatiya, Janata, Democratic Alliance, NDA, AIADMK, Janta Dal, bhakt, CAA, NRC, hindu majority, \
hindu unity, hindu pride, nationalist, sangh, sanghi, yogi, brahmin, brahman, smriti irani, hindu rashtra, jai shri ram, \
pm cares, pmcares, adani, hindu""".lower()
left = """left wing, LW, leftists, freedom, equality, fraternity, rights, progress, reform, internationalism, anarchist, communist, socialist, \
democratic socialist, social democrat, left-libertarian, progressive, social, liberal, western, Congress, UPA, RG, mamata, \
Aam, aadmi, AAP, CPI, Welfare, Protectionism, Commies, Rahul, gandhi, indira, yatra, arvind, kejriwal, inclusivity, \
libby, libbies, sjw, libtard, hinduphobia, LGBTQ, masjid, pappu, christian, muslim, secular, minority, minorities, Shashi, Tharoor, \
gay, lesbian, transgender, trans, reservation, abrahamic, godi""".lower()

right_terms = set(right.split(', '))
left_terms = set(left.split(', '))
len(right_terms), len(left_terms)

(62, 60)

In [6]:
list_right_terms = list(right_terms)
list_left_terms = list(left_terms)
full_terms = left_terms.union(right_terms)

In [7]:
def random_swap(term):
    if term in left_terms:
        # LW term being swapped
        return np.random.choice(list_right_terms)
    elif term in right_terms:
        # RW term being swapped
        return np.random.choice(list_left_terms)
    print(f"Error with: {term}")

def swap_terms(text):
    term_match = re.compile('|'.join([r'\b'+t for t in full_terms]))
    
    temp = term_match.sub(lambda m: random_swap(m.group()), text.lower())
    return temp

In [8]:
aug_records = []
for i, row in train_df.iterrows():
    aug_records.append({'clean_comment':row['clean_comment'].lower(), 'target':row['target']})
    # adding in record with swapped terms and opposite score
    if(row['target']>0.0):
        aug_records.append({'clean_comment':swap_terms(row['clean_comment']), 'target':-row['target']})
        # aug_records.append({'clean_comment':swap_terms(row['clean_comment']), 'target':-row['target']})
aug_df = pd.DataFrame.from_records(aug_records)
aug_df

Unnamed: 0,clean_comment,target
0,there are two types of parties those that prom...,0.00
1,"if fundamentals are sound then, faith can help...",0.75
2,"if fundamentals are sound then, faith can help...",-0.75
3,"contrary to popular belief, until modi, india ...",-0.50
4,bro inhone bahut land hijack kiya hai kisse pa...,-0.25
...,...,...
2512,if you had spent 5 min looking at the actual s...,0.00
2513,he can look at education. that is something he...,0.00
2514,it is good a initiative but the plastic in roa...,0.00
2515,"yup, been reading about it since couple of wee...",0.00


# TFIDF Embeddings

In [21]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word')

X_train = vectorizer.fit_transform(list(train_df['clean_comment']))
X_test = vectorizer.transform(list(eval_df['clean_comment']))
y_train, y_test = [np.sign(x)+1 for x in list(train_df['target'])], [np.sign(x)+1 for x in list(eval_df['target'])]

print(X_train.shape, X_test.shape)

(1999, 92418) (1000, 92418)


In [22]:
count = 0

new_voc = {}
for i in vectorizer.vocabulary_:
    if (i in left_terms) or (i in right_terms):
        new_voc[i] = count 
        count += 1 
print(new_voc)

{'order': 0, 'bjp': 1, 'aap': 2, 'congress': 3, 'shashi': 4, 'tharoor': 5, 'hindu': 6, 'modi': 7, 'hindutva': 8, 'rights': 9, 'caa': 10, 'fascist': 11, 'western': 12, 'trans': 13, 'nationalism': 14, 'christian': 15, 'religion': 16, 'freedom': 17, 'social': 18, 'muslim': 19, 'right wing': 20, 'minority': 21, 'amit': 22, 'shah': 23, 'bharatiya': 24, 'janata': 25, 'kejriwal': 26, 'nationalist': 27, 'ram': 28, 'secular': 29, 'upa': 30, 'gandhi': 31, 'nda': 32, 'rw': 33, 'duty': 34, 'cpi': 35, 'hindu rashtra': 36, 'pappu': 37, 'minorities': 38, 'equality': 39, 'reservation': 40, 'commies': 41, 'yogi': 42, 'liberal': 43, 'mandir': 44, 'yatra': 45, 'smriti irani': 46, 'conservative': 47, 'lgbtq': 48, 'nrc': 49, 'progressive': 50, 'adani': 51, 'hierarchy': 52, 'reaction': 53, 'rahul': 54, 'welfare': 55, 'gay': 56, 'brahmin': 57, 'rss': 58, 'progress': 59, 'left wing': 60, 'ramayan': 61, 'indira': 62, 'janta dal': 63, 'islamophobia': 64, 'sangh': 65, 'bhakt': 66, 'leftists': 67, 'aam': 68, 'aad

In [39]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), analyzer='word', vocabulary=new_voc)

x_train = vectorizer.fit_transform(list(train_df['clean_comment']))
x_val = vectorizer.transform(list(eval_df['clean_comment']))
y_train, y_val = [np.sign(x)+1 for x in list(train_df['target'])], [np.sign(x)+1 for x in list(eval_df['target'])]
x_train = x_train.toarray()
x_val = x_val.toarray()


print(x_train.shape, x_val.shape)

(1999, 97) (1000, 97)


# Transformers

In [40]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer

In [41]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [42]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [43]:
convert_text_to_seq = 0

In [44]:
if convert_text_to_seq:
    x_train = np.array(train_df['clean_comment'])
    y_train = np.array(train_df['true_label'])
    x_val = np.array(eval_df['clean_comment'])
    y_val = np.array(eval_df['true_label'])

    x_train_aug = np.array(aug_df['clean_comment'])
    y_train_aug= np.array(aug_df['target'])

    # Create a tokenizer
    tokenizer = Tokenizer()

    # Fit the tokenizer on your text
    # tokenizer.fit_on_texts(x_train)
    x_train = tokenizer.texts_to_sequences(x_train)
    x_val = tokenizer.texts_to_sequences(x_val)
    x_train_aug = tokenizer.texts_to_sequences(x_train_aug)

In [45]:
vocab_size = 20000  # Only consider the top 20k words
maxlen = 50  # Only consider the first 200 words of each movie review
# (x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)

1999 Training sequences
1000 Validation sequences


In [46]:
embed_dim = 32  # Embedding size for each token
num_heads = 8  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(3, activation="softmax")(x)
outputs = (outputs - 1) * 2

model = keras.Model(inputs=inputs, outputs=outputs)

In [47]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)
)

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {"<class 'numpy.float64'>"})