# <center>Mini-datathon for Text classification on twitter data

## <center>Team name: DATA GLADIATORS
    
<center> Members: Suvansh Vaid, Saurav Gupta, Mohit Gupta
    
    
    
## <center>Competition link: https://www.kaggle.com/c/mdss-basic-stream/overview

## Importing libraries

In [349]:
import pandas as pd
import nltk
from nltk.collocations import *
from itertools import chain
import itertools
from nltk.tokenize import RegexpTokenizer
from nltk.probability import *
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import MWETokenizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.ensemble import RandomForestClassifier

#nltk.download('stopwords')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

## Reading files

In [332]:
train = pd.read_csv('train_basic.csv')
test = pd.read_csv('test_basic.csv')

In [333]:
train.head()

Unnamed: 0,tweet,label
0,silencing blm : priti patel\xe2\x80\x99s anti-...,BLM
1,"trillian42_ johnbok5 nadiawhittomemp ""\'silly ...",BLM
2,"rt errolwebber: tell me, would this be conside...",BLM
3,apple won't let parler have an app but still k...,BLM
4,malika_andrews wojespn can we get jlm trending...,BLM


In [334]:
train.shape

(26400, 2)

In [335]:
train.dropna(inplace=True)

In [336]:
train.shape

(24329, 2)

In [337]:
# Idntifying labels
train['label'].value_counts()

Covid    8692
BLM      4990
Riots    4100
Trump    3333
Biden    3214
Name: label, dtype: int64

## Pre-processing data

In [318]:
def pre_process(df):
    
    df['tweet'] = df['tweet'].astype(str)
    
    # Dealing with newline characters
    df['tweet'] = df['tweet'].apply(lambda x: ' '.join(x.split('\n')))
    
    # Cleaning the urls
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x)) 

    # Cleaning the html elements
    df['tweet'] = df['tweet'].apply(lambda x: re.sub(r'<.*?>', '', x)) 
    
    # tokenize tweets
    tokenizer = RegexpTokenizer(r"[a-zA-Z]+(?:[-'][a-zA-Z]+)?") #rule based tokenisation               
    df['tokens'] = df['tweet'].apply(lambda x : tokenizer.tokenize(x.strip().lower()))
    
    # remove stopwords
    stop_words = set(stopwords.words('english')) 
    df['tokens'] = df['tokens'].apply(lambda x: [y for y in x if y not in stop_words])
    
    # stemming
    stemmer = PorterStemmer() #here porter stemmer is used because it has a lower error rate and is more efficient
    df['tokens'] = df['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])
    
    # lemmatizing
    #lemmatizer = WordNetLemmatizer() 
    #df['tokens'] = df['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
    
    # Removing tokens of length less than 3
    df['tokens'] = df['tokens'].apply(lambda x: [word for word in x if len(word) > 2])
    
    # Converting list of tokens to sentences
    df['tokens'] = df['tokens'].apply(lambda x: ' '.join(x))
        
    return df

In [339]:
# Pre processing the training data
train_processed = pre_process(train)

In [340]:
train_processed.head()

Unnamed: 0,tweet,label,tokens
0,silencing blm : priti patel\xe2\x80\x99s anti-...,BLM,silenc blm priti patel xe2 x80 x99 anti-protes...
1,"trillian42_ johnbok5 nadiawhittomemp ""\'silly ...",BLM,trillian 42_ johnbok nadiawhittomemp silli lit...
2,"rt errolwebber: tell me, would this be conside...",BLM,errolwebb tell would consid racist xe2 x80 x9d...
3,apple won't let parler have an app but still k...,BLM,appl let parler app still keep twitter allow m...
4,malika_andrews wojespn can we get jlm trending...,BLM,malika_andrew wojespn get jlm trend mayb nba p...


In [321]:
# creating count vectorizer
#vectorizer = CountVectorizer()
#text_tf = vectorizer.fit_transform(train_processed.tokens).toarray()

In [341]:
# creating tfidf vectorizer
tf = TfidfVectorizer(use_idf = True)
text_tf = tf.fit_transform(train_processed.tokens).toarray()

In [342]:
text_tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [343]:
y = train['label']
print (y.value_counts())

Covid    8692
BLM      4990
Riots    4100
Trump    3333
Biden    3214
Name: label, dtype: int64


In [344]:
# Preprocessing test data
test_processed = pre_process(test)

In [345]:
test_processed.head()

Unnamed: 0,Train_id,tweet,tokens
0,1,congratulations rrhdr and publichealthumn!! \x...,congratul rrhdr publichealthumn xf0 x9f x91 x8...
1,2,the same people who perpetrated the whitesupre...,peopl perpetr whitesupremaci domesticterrorist...
2,3,fannie lou hamer\n\nblackhistory\nblackhistory...,fanni lou hamer nblackhistori nblackhistorymon...
3,4,kylandyoung williamcson $90 million properly d...,kylandyoung williamcson million properli distr...
4,5,its the colors for the month for me.\n.\nblack...,color month nblacklivesmatt nthat' that post n...


In [346]:
# creating tfidf for test data
test_tf = tf.transform(test_processed.tokens).toarray()

## Training the model and making predictions on test data

In [347]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0, n_estimators= 100)
rf.fit(text_tf, y)
y_pred = rf.predict(test_tf)

In [348]:
submission = pd.DataFrame.from_dict({
    'Train_id': test.Train_id,
    'label': y_pred
})

submission.to_csv('submission.csv', index=False)