# Importing necessary libraries

### Importing the libraries needed for building the model.
  - numpy to do some numerical operations
  - pandas for cleaning and other processings needed by the data
  - matplotlib for visualizing if there's a need to
  - re (regular expressions) for doing some cleaning on the text data itself:
    - to remove some prefix and suffixes
    - to remove emoji like comments
    - to remove @ keyword etc
  - bs4 to scrape the websites for the slangs used by Nigerians
  - requests to fetch the pages of the websites being scraped


In [1]:
from collections import Counter
from itertools import combinations
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
import nltk.corpus
from nltk.corpus import stopwords
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Scraping process

In [2]:
# Fetch the first slang-dictionary page and locate the <ul> whose links are
# read in the next cell.
url = "https://insight.ng/spice/nigerian-slangs-dictionary/"
# A timeout stops the cell from hanging forever on an unreachable host.
req = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page.
req.raise_for_status()
soup = bs(req.text, "html.parser")
slangs = soup.find("ul", attrs={"class": "ez-toc-list-level-3"})
# soup.find returns None when nothing matches; report that here rather than
# as an AttributeError in a later cell.
if slangs is None:
    raise ValueError(
        "Could not find <ul class='ez-toc-list-level-3'> on {}".format(url)
    )

ConnectionError: HTTPSConnectionPool(host='insight.ng', port=443): Max retries exceeded with url: /spice/nigerian-slangs-dictionary/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000243D67CCF70>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

# Creating a list for slang from the first website.

In [None]:
# Collect the text of every <a> element found inside the scraped list.
slang_list = [anchor.text for anchor in slangs.select('a')]
# Cell output: number of entries collected.
len(slang_list)

In [None]:
# Split every entry on '/' and flatten all the pieces into one list.
new_slang_list = [
    piece
    for entry in slang_list
    for piece in entry.split('/')
]

In [None]:
# Normalise each term: drop non-breaking spaces, full stops, colons and
# digits, then trim surrounding whitespace.
cleaned_slang_list_1 = []
for word in new_slang_list:
    word = word.replace('\xa0', '').replace('.', '').replace(':', '')
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    word = re.sub(r"\d+", '', word)
    cleaned_slang_list_1.append(word.strip())

# Cleaning the texts gotten from the website for easier processing.

In [None]:
# Entries that are not slang terms and must be dropped.
unknown = ['About Author', 'Latest entries', '+']
# Build a filtered list instead of calling remove() inside a loop over the
# same list: removing while iterating shifts the remaining items, so the
# element right after each removed one is never examined.
cleaned_slang_list_1 = [
    word for word in cleaned_slang_list_1 if word not in unknown
]

### 58 Texts gotten from the first website after scraping.

In [None]:
# Cell output: number of terms kept from the first website.
len(cleaned_slang_list_1)

# Scraping process for the second website

In [None]:
# Fetch the second slang page and collect the link text of each item in its
# "lwptoc_items" container.
slangs_2_list = []

url_2 = "https://www.skabash.com/popular-nigerian-slangs-and-their-meanings/"
# A timeout stops the cell from hanging forever on an unreachable host.
req_2 = requests.get(url_2, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page.
req_2.raise_for_status()
soup_2 = bs(req_2.text, "html.parser")
slangs_2 = soup_2.find("div", attrs={"class":"lwptoc_items lwptoc_items-visible"})
# soup.find returns None when nothing matches; give a clear error instead of
# an AttributeError on the next line.
if slangs_2 is None:
    raise ValueError(
        "Could not find the 'lwptoc_items' container on {}".format(url_2)
    )

for element in slangs_2.find("div").select("div"):
    slangs_2_list.append(element.find('a').text)

In [None]:
# Strip newlines, digits and full stops from every scraped entry.
new_slang_list_2 = []

for slang in slangs_2_list:
    slang = slang.replace('\n', '')
    # Raw string: '\d' in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    slang = re.sub(r'\d', '', slang)
    slang = slang.replace('.', '')
    new_slang_list_2.append(slang)

In [None]:
# Drop this entry, presumably a section heading rather than a slang term.
# list.remove raises ValueError if the string is not in the list.
new_slang_list_2.remove("Popular Nigerian slangs that are trending")

In [None]:
# De-duplicate while keeping first-seen order. list(set(...)) also removes
# duplicates, but its ordering can differ from one interpreter run to the
# next (str hashing is randomised), which makes later steps irreproducible.
new_slang_list_2 = list(dict.fromkeys(new_slang_list_2))

# Split every entry on '/' and flatten all the pieces into one list.
cleaned_slang_list_2 = [
    piece
    for entry in new_slang_list_2
    for piece in entry.split('/')
]

### 33 slangs gotten from the second website

In [None]:
# Cell output: number of terms kept from the second website.
len(cleaned_slang_list_2)

### Combining the two slang lists into a single list by mere list addition

In [None]:
# Concatenate the terms from both websites; duplicates across the two lists
# are not removed here.
long_list = cleaned_slang_list_1 + cleaned_slang_list_2
# Cell output: total number of terms.
len(long_list)

In [None]:
# Search phrases used to collect the tweets that are later labelled
# 'not vulgar'. Each phrase appears once: a repeated phrase would be queried
# twice and put the same tweets into the dataset twice.
short_list = [
    "really very good",
    "very great and nice",
    "pretty nice",
    "God bless",
    "better things",
    "Lord is good",
    "Bright and beautiful",
    "going well",
    "soft life",
    "good news",
    "greater things",
]

# SCRAPING **TWITTER** FOR TWEETS THAT CONTAIN THE ABOVE WORDS USING THE __SNSCRAPE__ LIBRARY

In [None]:
!pip install snscrape

In [None]:
import snscrape.modules.twitter as snstwitter

# THE SCRAPING PROCESS

  - The list was gotten by specifying the country's location via the coordinates as shown with the variable loc.

In [None]:
# loc = '9.077751, 8.6774567, 100km'
# tweet_list = []
# for word in long_list:
#   for i, item in enumerate(snstwitter.TwitterSearchScraper('{} geocode:"{}"'.format(word, loc)).get_items()):
#     if i > 150:
#       break
#     tweet_list.append([item.content, item.likeCount, item.user.location])
# df = pd.DataFrame(tweet_list, columns = ["tweets", "likes", "location"])
# df.to_csv("slang_tweets_1.csv")

In [None]:
# loc = '9.077751, 8.6774567, 100km'
# tweet_list = []
# for word in short_list:
#   for i, item in enumerate(snstwitter.TwitterSearchScraper('{} geocode:"{}"'.format(word, loc)).get_items()):
#     if i > 150:
#       break
#     tweet_list.append([item.content, item.likeCount, item.user.location])
# df_short = pd.DataFrame(tweet_list, columns = ["tweets", "likes", "location"])
# df_short.to_csv("slang_tweets_2.csv")

In [None]:
# Load the two tweet CSV files from disk into df and df_short.
# NOTE(review): if these files were written by DataFrame.to_csv with its
# default index=True, each frame will carry an extra "Unnamed: 0" column.
# Confirm, and consider index_col=0.
df = pd.read_csv("slang_tweets_1.csv")
df_short = pd.read_csv("slang_tweets_2.csv")
# Cell output: first 20 rows of df_short.
df_short.head(20)

In [None]:
# Tag every row with its class: all rows of df_short get 'not vulgar',
# all rows of df get 'vulgar'.
df_short['label'] = 'not vulgar'
df['label'] = 'vulgar'

In [None]:
# Stack the two frames row-wise into one dataset; ignore_index=True gives the
# result a fresh index instead of keeping the two original ones.
df_total = pd.concat([df, df_short], axis=0, ignore_index=True)

In [None]:
# Cell output: the combined, labelled dataset.
df_total

### Making a copy of the dataset

In [None]:
# Work on a copy so that df_total itself is left unchanged.
df_1 = df_total.copy()

# **CLEANING THE TEXT DATA**
### The Text data needs to be cleaned before being fed into the model for training and testing.
    - Normalizing 
    - Remove Unicode Characters
    - Remove Stopwords
    - Perform Stemming
    - Lemmatization

**Normalizing the texts..**
  - changing all to lowercases also known as _case normalization_
    - creating a function used for changing the case to lowercase.

In [None]:
def lowercase(text):
    """Return a lower-cased copy of *text*."""
    return text.lower()

**Removing Unicode Characters**
  - Creating a function to remove unicode characters
    - This function uses regular expression library

In [None]:
# Compiled once; alternatives are tried left to right at every position.
_NOISE_PATTERN = re.compile(
    r"(@[A-Za-z0-9_]+)"        # @mentions, including the user name
    r"|([^0-9A-Za-z \t])"      # any char that is not ASCII alnum, space or tab
    r"|(\w+://\S+)"            # scheme://... links
    r"|^rt\b"                  # a leading retweet marker, as a whole word
    r"|http\S+"                # any remaining token that starts with "http"
)


def unicode_removal(text):
    """Strip mentions, links, a leading "rt" and non-alphanumeric characters.

    Fixes over the previous pattern:
      * ``@\\[A-Za-z0-9]+`` escaped the bracket, so it only matched a literal
        "@[A-Za-z0-9]" and real mentions kept their user name in the text.
      * ``^rt`` also cut "rt" off ordinary words at the start of a text;
        it now requires a word boundary.
      * ``http.+?`` consumed a single character after "http"; it now consumes
        the rest of the token.

    Args:
        text: The string to clean. The "rt" marker is matched in lower case
            only.

    Returns:
        The cleaned string. Whitespace is left as is, so runs of spaces may
        remain where tokens were removed.
    """
    return _NOISE_PATTERN.sub("", text)

# **STOP WORDS**
#### Stopwords are the most common words in any natural language. For the purpose of analyzing text data and building NLP models, these stopwords might not add much value to the meaning of the document. Generally, the most common words used in a text are “the”, “is”, “in”, “for”, “where”, “when”, “to”, “at” etc. 

**Removing Stopwords**
  - Creating a function to remove the stop words
    - This function uses the Natural Language ToolKit library 

In [None]:
# Download the stop-word corpus needed below.
nltk.download('stopwords')

# Built once as a frozenset: the previous version rebuilt the list on every
# call and scanned it linearly for each word of the text.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def stopwords_removal(text):
    """Return *text* without English stop words.

    The text is split on whitespace, and the kept words are joined back with
    single spaces. Matching is exact and case-sensitive, so the text should be
    lower-cased first.

    Args:
        text: The string to filter.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    kept = [word for word in text.split() if word not in _ENGLISH_STOP_WORDS]
    return " ".join(kept)

# Stemming

#### Stemming involves grouping words by their root stem. This makes it clear, or helps recognize, that ‘jumping’, ‘jumps’ and ‘jumped’ are all rooted in the same verb (jump) and thus refer to similar things.
  - Creating a function to stem the texts.

In [None]:
def stemming(text):
    """Return *text* with every whitespace-separated token Porter-stemmed.

    The stemmed tokens are joined back with single spaces.
    """
    porter = PorterStemmer()
    stems = (porter.stem(token) for token in text.split())
    return " ".join(stems)

# Lemmatization

#### Lemmatization groups words based on root definition, and helps to differentiate between present, past, and indefinite.

#### In other words, ‘jumps’ and ‘jump’ are grouped into the present ‘jump’, as different from all uses of ‘jumped’, which are grouped together as past tense, and all instances of ‘jumping’, which are grouped together as the indefinite (meaning continuing/continuous).

In [None]:
# Download the WordNet data used by the lemmatizer.
nltk.download('wordnet')
def lemmatize(text):
    """Return *text* with every whitespace-separated token lemmatized.

    The lemmatized tokens are joined back with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = (wordnet_lemmatizer.lemmatize(token) for token in text.split())
    return " ".join(lemmas)

### Selecting the useful feature we need to train the model.

In [None]:
# x: the 'tweets' column (model input); y: the 'label' column (target).
x = df_1['tweets']
y = df_1['label']

### Passing the text one by one through each of the functions created above using the lambda function method.

In [None]:
# Run every tweet through the cleaning steps, in this exact order.
preprocessing_steps = (
    lowercase,
    unicode_removal,
    stopwords_removal,
    stemming,
    lemmatize,
)
for step in preprocessing_steps:
    x = x.apply(step)

In [None]:
# Rename the classes: 'vulgar' becomes 'bullying', anything else becomes
# 'non-bullying'.
y = y.map(lambda label: 'non-bullying' if label != 'vulgar' else 'bullying')

### Splitting the dataset into training and testing sets using the train test split...

In [None]:
# Hold out 30% of the rows for testing; random_state=40 is passed so the
# split is the same on every run.
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.3, random_state=40)

### Using Count vectorizer to vectorize the preprocessed texts

In [None]:
# Fit the vectorizer on the training texts only and encode them; the test
# texts are encoded later with transform() using this fitted vocabulary.
vectorizer =  CountVectorizer()
x_train_vect = vectorizer.fit_transform(x_train)

In [None]:
# Train a Gaussian naive Bayes classifier; .toarray() converts the vectorizer
# output to a dense array before it is passed to fit().
model = GaussianNB()
model.fit(x_train_vect.toarray(),y_train)

### Testing the model

In [None]:
# Encode the test texts with the already-fitted vectorizer (transform only,
# no refitting).
x_test_vect = vectorizer.transform(x_test)

In [None]:
# Predict a label for every test row from the dense test matrix.
y_pred = model.predict(x_test_vect.toarray())

## Checking the accuracy score
###    - f1 score
###    - precision
###    - recall
###    - and the _confusion matrix_

In [None]:
# Accuracy of the predictions against the true test labels.
print("accuracy score:", accuracy_score(y_test, y_pred))

In [None]:
# Per-class precision, recall and f1. Reuses y_pred instead of running
# model.predict over the whole test matrix a second time.
print('classification reports:\n', classification_report(y_test, y_pred))

##### The model has 84% accuracy

### Saving the model as a pickle file

In [None]:
# Persist the trained GaussianNB model (not a decision tree) with pickle.

import pickle

# Write the fitted model to tweet.pkl.
with open('tweet.pkl','wb') as myfile:
    pickle.dump(model,myfile)

# Load it straight back, replacing `model` with the unpickled object.
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open('tweet.pkl','rb') as myfile:
    model = pickle.load(myfile)

# Save the fitted vectorizer too: new text must be encoded with the same
# vocabulary the model was trained on.
with open("vectorizer.pkl", "wb") as vect:
    pickle.dump(vectorizer, vect)

## TESTING

#### `bullying` - Vulgar
#### `non-bullying` - Non-Vulgar

In [None]:
text = ["sapa is not nice o na why you dey do like mad man", "This is a really wonderful news. more wins bro"]
# Apply the same cleaning steps, in the same order, that the training tweets
# went through. The vectorizer's vocabulary was built from cleaned, stemmed
# text, so raw words would otherwise fail to match it.
cleaned_text = text
for step in (lowercase, unicode_removal, stopwords_removal, stemming, lemmatize):
    cleaned_text = [step(sentence) for sentence in cleaned_text]
text_vect = vectorizer.transform(cleaned_text)

In [None]:
# Cell output: one predicted label per input sentence. The model was trained
# on the string labels 'bullying' / 'non-bullying', so those are returned.
model.predict(text_vect.toarray())