## **Libraries**

In [2]:
#Basic Libraries
import re
import nltk
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#NLTK libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

#Feature extraction process
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [47]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## **Dataset**

In [4]:
train = pd.read_csv('NLP/Train.csv')
test = pd.read_csv('NLP/Test.csv')
SampleSubmission = pd.read_csv('NLP/SampleSubmission.csv')

## **Dataset Exploration**

In [6]:
print(f"The shape of the Train data is : {train.shape}")
print(f"The shape of the Test data is : {test.shape}")
print(f"The shape of the SampleSubmission data is : {SampleSubmission.shape}")

The shape of the Train data is : (70000, 3)
The shape of the Test data is : (30000, 2)
The shape of the SampleSubmission data is : (30000, 2)


In [8]:
train.head(3)

Unnamed: 0,ID,text,label
0,13P0QT0,3sbaaaaaaaaaaaaaaaaaaaa lek ou le seim riahi o...,-1
1,SKCLXCJ,cha3eb fey9elkoum menghir ta7ayoul ou kressi,-1
2,V1TVXIJ,bereau degage nathef ya slim walahi ya7chiw fi...,-1


In [9]:
test.head(3)

Unnamed: 0,ID,text
0,2DDHQW9,barcha aaindou fiha hak w barcha teflim kadhalik
1,5HY6UEY,ye gernabou ye 9a7ba
2,ATNVUJX,saber w barra rabbi m3ak 5ouya


In [10]:
SampleSubmission.head(3)

Unnamed: 0,ID,label
0,2DDHQW9,0
1,5HY6UEY,0
2,ATNVUJX,0


## **Exploratory Data Analysis**

In [21]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      70000 non-null  object
 1   text    70000 non-null  object
 2   label   70000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.6+ MB


In [11]:
train['label'].unique()

array([-1,  1,  0])

In [74]:
train['label'].value_counts()

 1    38239
-1    29295
 0     2466
Name: label, dtype: int64

In [26]:
#Checking the dataset randomly
train['text'][np.random.randint(0, len(train))]

'raj3elna moudarebna sabe9 chiheb lili hadhi mtalebna tawa'

In [25]:
train.isnull().sum().sum(), test.isnull().sum().sum()

(0, 0)

## **Dataset Preprocessing**

In [29]:
punctuations = string.punctuation

### **Bag of words**

In [354]:
sentiments = [x for x in train['text'].values]

In [355]:
stemmer = PorterStemmer()

In [356]:
def Bag_of_words(sentiment_list):
  Bag_of_Words = []
  for sentiment in sentiment_list:
    words = nltk.word_tokenize(sentiment)
    for word in words:
      Bag_of_Words.append(stemmer.stem(word))#Stemming the words before appending
  return Bag_of_Words

In [357]:
BagofWords = Bag_of_words(sentiments)

In [358]:
print(f"The Bag of word has a total of {len(BagofWords)} words with {len(set(BagofWords))} unique words")

The Bag of word has a total of 688634 words with 138481 unique words


In [359]:
#Selecting the unique words from the bag of words
BagofWords = list(set(BagofWords))
#Removing stopwords
BagofWords = [x for x in BagofWords if x not in punctuations]

### **Sentiments based BOW**

In [361]:
negative_sentiment = [sentiments for sentiments in train['text'][train['label'].eq(-1)]]
positive_sentiment = [sentiments for sentiments in train['text'][train['label'].eq(1)]]
neutral_sentiment = [sentiments for sentiments in train['text'][train['label'].eq(0)]]

negative_BOW = Bag_of_words(negative_sentiment)
positive_BOW = Bag_of_words(positive_sentiment)
neutral_BOW = Bag_of_words(neutral_sentiment)

In [242]:
#[words for words in BagofWords if words[0] == 'h']
#hah = [words for words in BagofWords if words[0:3] in ['haa']]

## Trying Out some Regex

#### hah

In [186]:
hah = [words for words in BagofWords if words[0:3] in ['hah']]
hah_words = ' '.join(w for w in hah)
hah_words

'hahahahahhaahahhahahahah hahhahha hahahahahhaha hahahahhahahaha hahhhhhhhhhhhhhhhhhhhhhya hahahahahahahahah hahahahahahahahahah hahahaaaa hahahahahahaaaaaaaa hahahahhahahahhhhhhhhhaaaaahahaahahahhahahahhhahahahahahaahahahahgahaahahahaahhahahhhhhhh hahahahhhhhhhhhhhhh hahhahahahah hahahahahahah hahahaah hahahahahahaahhaha hahaah hahahahahahhaha hahhaa hahahahahah hahahahahaha hahahaha hahahahahahahah hahahhhhhh hahahahahaihohaw hahahah hahhhhhhhh hahhahahha hahom hahahahahahahaha hahhaah hahahahah hahahahhaaha haho hahahahahahahaaa hahahahahahahahahahahahahahhahaha hahahahahahahha hahhhhhhhhh hahda hahouwa hahahh hahi hahahahahahahahahahahahahahah hahahahhahah hahahahahhhh hahoma hahla hahahahahahahahahhhhhh hahahahahh hahahahahhahahahahhahahahaha hahahaaaahah hahouu hahahahahahaaaaaaaaa hahahahahahahahaha hahahhhhhhhhhhh hahahahahahaahahahaa hahahahahhah hahaaa hahah hahahahhahahahahha haha hahahahhahahah hahahahahahahahaaaaaaa hah hahahhahahha hahahahahahahahahahahahahahahahhahahahah

In [188]:
re.sub(r'h+[a|e|h|i|o]+h*[a|e|h|i|o]*[a|e|h]*', 'haa', hah_words)

'haa haa haa haa haaya haa haa haa haa haagahaa haa haa haa haa haa haa haa haa haa haa haa haa haa haaw haa haa haa haam haa haa haa haa haa haa haa haa haa haada haauwa haa haa haa haa haa haama haala haa haa haa haa haauu haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haau haa haa haau haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haaw haa haa haa haa haa haa haaz haa haa haa haa haa haawa haa haa haa'

In [204]:
re.sub(r'h+a[h|a]*', 'haa', hah_words)

'haa haa haa haa haaya haa haa haa haa haagahaa haa haa haa haa haa haa haa haa haa haa haa haa haa haaihohaaw haa haa haa haaom haa haa haa haa haao haa haa haa haa haada haaouwa haa haai haa haa haa haaoma haala haa haa haa haa haaouu haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haa haaoou haa haai haaou haa haa haa haa haa haa haa haa haa haa haa haa haa haaia haa haa haa haa haa haa haa haa haaoo haa haa haa haaw haa haa haa haa haa haa haaz haa haa haa haa haa haawa haa haa haa'

### hha

In [235]:
hha = [words for words in BagofWords if words[0:3] in ['hha']]
hha_words = ' '.join(w for w in hha)
hha_words

'hhahahhahahahhah hhahahahaha hhaha hhahahhahahahahah hhahhahahaha hhaahaaahh hhahha hhahahahahahhahahhahahahhahhhahhahahahhahahahahahhahaha hhaw hhahahhahahh'

In [236]:
re.sub(r'h+a[h|a]*', 'haa', hha_words)

'haa haa haa haa haa haa haa haa haaw haa'

### hhh

In [225]:
hhh = [words for words in BagofWords if words[0:3] in ['hhh']]
hhh_words = ' '.join(w for w in hhh)
hhh_words

'hhhhhhhhhj hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhwoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhiiiiiihihihihihiihihihihiihhoohohohohohohoohohohahahahahahahahahahahahehehehehehehehehh hhhhhhhhhvhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhjjhh hhhhhhhhhhha hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhaaa9999999 hhhhhmala hhhhhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhha hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhhhmenchi5o hhhhhtaw hhhhhhhhhhhhhhh hhhhhhahhhhha hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhhhhhhhhhhh hhhhhhhhhhhhhhhhhhhhhh

In [226]:
re.sub(r'hhh+', 'hh', hhh_words)

'hhj hhwoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooh hhiiiiiihihihihihiihihihihiihhoohohohohohohoohohohahahahahahahahahahahahehehehehehehehehh hhvhh hhjjhh hha hh hhaaa9999999 hhmala hh hh hh hha hh hhmenchi5o hhtaw hh hhahha hh hh hh hhwanta hha hh hh hh hhj hhkol hhjhhjjhhggghvhh hh hhghghhg hh hhahha hh hhest hhrabi hhkkkjjjhhjjjjjjjjjhhjjjh hh hh hhaaahh hhlkolhom hhj hh hh hh hh hh hhsaha hh hh hhaaaaaaahh hh hh hh hhghh hhncahh hhmala hhg hh hh hhrabi hhb hh hh hha hha9wa hh hh hhj hhohi hhnh hh hhj hh hhvhv hh hhjjh hh hh hh hh hh hh hh hhb hhahahaha hhjjhh hhnh hhg hh hh hhjhjj hhjj hhnon hhwlhi hh hh hhobbeek hh hh hh hh hh hhaaaaaahh hh0 hh hh hh hh hh hh hh hh hhv hh hhjjjj hhena hhsi hhjjh hhamm hhloooooooolmdrrrrrrrrrrrrr hhxd hh hhjj hhaaaaaaaa hhbh hh hh3 hh hh hhahahahah hh hhyhh hhfourat hh hh hh hh hhg hh hh hh hhkkkkkkkll hh'

In [234]:
re.sub(r'he+h*[h|e]*', 'heh', hhh_words)#heh

'hhhhhhhhhj hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhwoooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhiiiiiihihihihihiihihihihiihhoohohohohohohoohohohahahahahahahahahahahaheh hhhhhhhhhvhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhjjhh hhhhhhhhhhha hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhaaa9999999 hhhhhmala hhhhhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhha hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhhhmenchi5o hhhhhtaw hhhhhhhhhhhhhhh hhhhhhahhhhha hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh hhhhhhhhhhhhhh hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh

### heh

In [220]:
heh = [words for words in BagofWords if words[0:3] in ['heh']]
heh_words = ' '.join(w for w in heh)
heh_words

'hehdik heha hehehe hehii hehdom heheheheheh heheheheh heh hehe hehi hehdiiii hehoude hehom hehda heheehehehhehehe'

In [221]:
re.sub(r'he*h*[h|e]*', 'heh', heh_words)

'hehdik heha heh hehii hehdom heh heh heh heh hehi hehdiiii hehoude hehom hehda heh'

### haa

In [181]:
haa = [words for words in BagofWords if words[0:3] in ['haa']]
haa_words = ' '.join(w for w in haa)
haa_words

'haaw haaaaaaaaaaaaahaha haazetkomm haakaaa haal haa haakk haaka haaaaj haaaaaaahaaaaaahaaaaahaaaa haaq haand haahah haaj haaay haahhaha haazii haaah haak haaaaaaaaaaa haaaa haaha haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaj haaaw haaaaaaaaa haaaaou haaaaaaaaaa333333 haaa haaaaaaaha haaaaaa haaad haazina haah haaas haaaaa haaahh haahhahah haaaah haaaaaaaamdoullaaaaaaaaaaaah haaahaa haaalb haahhahahhhhaahhahh haaaana haahha haaaaha haamv haaaj'

In [218]:
re.sub(r'he*h*[h|e]*', 'heh', haa_words)#heh

'hehaaw hehaaaaaaaaaaaaahehaheha hehaazetkomm hehaakaaa hehaal hehaa hehaakk hehaaka hehaaaaj hehaaaaaaahehaaaaaahehaaaaahehaaaa hehaaq hehaand hehaahehaheh hehaaj hehaaay hehaahehaheha hehaazii hehaaaheh hehaak hehaaaaaaaaaaa hehaaaa hehaaheha hehaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaj hehaaaw hehaaaaaaaaa hehaaaaou hehaaaaaaaaaa333333 hehaaa hehaaaaaaaheha hehaaaaaa hehaaad hehaazina hehaaheh hehaaas hehaaaaa hehaaaheh hehaahehahehaheh hehaaaaheh hehaaaaaaaamdoullaaaaaaaaaaaaheh hehaaahehaa hehaaalb hehaahehahehahehaahehaheh hehaaaana hehaaheha hehaaaaheha hehaamv hehaaaj'

In [219]:
re.sub(r'hhh+', 'hh', haa_words)#hhh

'haaw haaaaaaaaaaaaahaha haazetkomm haakaaa haal haa haakk haaka haaaaj haaaaaaahaaaaaahaaaaahaaaa haaq haand haahah haaj haaay haahhaha haazii haaah haak haaaaaaaaaaa haaaa haaha haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaj haaaw haaaaaaaaa haaaaou haaaaaaaaaa333333 haaa haaaaaaaha haaaaaa haaad haazina haah haaas haaaaa haaahh haahhahah haaaah haaaaaaaamdoullaaaaaaaaaaaah haaahaa haaalb haahhahahhaahhahh haaaana haahha haaaaha haamv haaaj'

## Regex

In [266]:
def sentence_clean(sentence):
  sentence_clean = re.sub(r'3sba+','3sba',sentence)
  sentence_clean = re.sub(r'ha*h*\w*j+','haj', sentence_clean) #haj
  sentence_clean = re.sub(r'hhh*', 'hh', sentence_clean) #hhh
  sentence_clean = re.sub(r'h+a[h|a]*', 'haa', sentence_clean) #hha haa hah
  sentence_clean = re.sub(r'he+h*[h|e]*', 'heh', sentence_clean) #heh
  sentence_clean = re.sub(r'bra+vo+', 'bravo', sentence_clean) #heh
  sentence_clean = re.sub(r'caa+', 'caa', sentence_clean) #heh
  sentence_clean = re.sub(r'cc*a+', 'ccaa', sentence_clean)
  sentence_clean = re.sub(r'cc{1,3}', 'cc', sentence_clean)
  sentence_clean = re.sub(r'cpr+', 'cpr', sentence_clean)
  sentence_clean = re.sub(r'ctt*', 'ct', sentence_clean)
  sentence_clean = re.sub(r'kkkk*', 'kkk', sentence_clean)
  return sentence_clean

In [386]:
[words for words in BagofWords if words[0] == 'l']

['lettejamo3',
 'lefriq',
 'lmouchkla',
 'lafriqu',
 'likheir',
 'ldandouk',
 'luminati',
 'l3aylat',
 'lfadha7',
 'li5dem',
 'li7y',
 'lbag',
 'loussa55',
 'lekthor',
 'lemmaliha',
 'lefrii9iii',
 'lesgayratek',
 'lmafr5a',
 'lel2bad',
 'ltoun',
 'lbaxar',
 'li3malto',
 "l'espéranc",
 'lamdtou',
 'lowle',
 'lndamek',
 'lkhawa',
 'l7olm',
 'lifeh',
 'lahiyin',
 'lighnah',
 'lclubbbbbbbbb',
 'lfar7a',
 'leurop',
 'lotf',
 'lpro',
 'la3éd',
 'lkhrya',
 'lsouria',
 'lebya',
 'l3onsori',
 'lkhayr',
 'liitaal3',
 'lclùb',
 'lgalya',
 'lass3ad',
 'lwaldin',
 'lmoslmin',
 'lhond',
 'lemlabya',
 'l2slamia',
 'lemair',
 'lketib',
 'lbaji',
 'lakab',
 'liiiih',
 'lfcb',
 'l9arawen',
 'labsha',
 'lmochkel',
 'lerhab',
 'layem',
 'lomm',
 'lehwa',
 'lgalbbbbbbbb',
 'lpitrol',
 'lokher',
 'lgitou',
 'l9yt',
 'lgafsaaaa',
 'luv',
 'lfhom',
 'littih',
 'l7eeeee',
 'lmakyaj',
 'lekwanjia',
 'lelli',
 'lazemk',
 'la3sab',
 'l3ezz',
 'leekin',
 'lhase',
 'ldfina',
 'lhw',
 'lamn',
 'la3lik',
 'lootf',
 

In [366]:
caa = ' '.join(words for words in BagofWords if words[0:2] == 'ct')

In [368]:
re.sub(r'ctt*', 'ct', caa)

'ct cton ct ct ctn ctair'