In [245]:
import pandas as pd
import numpy as np

#### Read the csv using pandas

In [246]:
df = pd.read_csv('blogtext.csv')

In [247]:
df.head() # sample dataset lookup

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [248]:
df.shape #shape of the dataset

(681284, 7)

In [249]:
df.dtypes # datatypes of dataset

id         int64
gender    object
age        int64
topic     object
sign      object
date      object
text      object
dtype: object

#### Check if there is any null value, and get the total count.

In [250]:
df.isnull().sum() # per-column count of missing values (all zero: no null values)

id        0
gender    0
age       0
topic     0
sign      0
date      0
text      0
dtype: int64

In [251]:
df = df.head(10000) # consider only the first 10000 rows for model development

#### Data Pre Processing

- Remove unwanted characters
- Convert text to lowercase
- Remove unwanted spaces
- Remove stopwords

In [252]:
df_clean = df.copy()

In [253]:
df_clean

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
...,...,...,...,...,...,...,...
9995,1705136,female,25,indUnk,Pisces,"19,May,2004",take me home with you forever where I ...
9996,1705136,female,25,indUnk,Pisces,"23,June,2004",seductive secretness behind doors warn...
9997,1705136,female,25,indUnk,Pisces,"21,June,2004",For being so kind to me when I need yo...
9998,1705136,female,25,indUnk,Pisces,"09,June,2004",blurry outside sounds as people mingle...


In [254]:
# Keep letters only: every run of non-alphabetic characters collapses to one space
import re
non_letters = re.compile('[^A-Za-z]+')
df_clean['text_clean'] = df_clean['text'].map(lambda raw: non_letters.sub(' ', raw))

In [255]:
# Lowercase the cleaned text
df_clean['text_clean'] = df_clean['text_clean'].str.lower()

In [256]:
# Trim leading and trailing whitespace
df_clean['text_clean'] = df_clean['text_clean'].str.strip()

In [257]:
# Remove stopwords
from nltk.corpus import stopwords
# Keep the word set under its own name: rebinding `stopwords` would shadow the
# imported corpus reader, and re-running this cell would then fail with
# AttributeError ('set' object has no attribute 'words').
english_stopwords = set(stopwords.words('english'))
df_clean['text_clean'] = df_clean.text_clean.apply(
    lambda x: ' '.join([word for word in x.split() if word not in english_stopwords])
)

In [258]:
 df_clean[['text','text_clean']]

Unnamed: 0,text,text_clean
0,"Info has been found (+/- 100 pages,...",info found pages mb pdf files wait untill team...
1,These are the team members: Drewe...,team members drewes van der laag urllink mail ...
2,In het kader van kernfusie op aarde...,het kader van kernfusie op aarde maak je eigen...
3,testing!!! testing!!!,testing testing
4,Thanks to Yahoo!'s Toolbar I can ...,thanks yahoo toolbar capture urls popups means...
...,...,...
9995,take me home with you forever where I ...,take home forever may rest sleep arms forgotte...
9996,seductive secretness behind doors warn...,seductive secretness behind doors warning neve...
9997,For being so kind to me when I need yo...,kind need holding hand petting hair cry bring ...
9998,blurry outside sounds as people mingle...,blurry outside sounds people mingle pass darkn...


**From the above comparison, we have cleaned the data in every aspect**

In [259]:
# The raw text is no longer needed once text_clean exists
df_clean = df_clean.drop(columns=['text'])

In [260]:
df = df_clean

In [261]:
df.text_clean[10] #clean data

'ah korean language looks difficult first figure read hanguel korea surprisingly easy learn alphabet characters seems easy vocabulary starts oh backwards us sentence structure yikes luckily many options us slow witted foreigners take language course could list urllink joongang article says lot resources urllink well guy motivation jeon ji hyun latest something actually star movies cfs hear means commercial feature positive saw latest movie sunday night hard describe name english version windstruck korean version yeochinso short ne yeojachingu rul sogayhamnida like introduce girlfriend surprisingly titles make sense like website korean english looks quite good actually urllink movie shown theatres subtitles special times info urllink list many theatres seoul click urllink urllink great reason learn korean already married went foreigners well local korean national course korean take picture put urllink movie hof bar update bud mine passed urllink link giordano ad apparently aired korea n

#### Merge the other columns to have dependent and independent variables

In [262]:
# To identify the author, the gender and topic columns are merged into a single
# two-element label list per row.
df['labels'] = pd.Series(
    [[gender, topic] for gender, topic in zip(df['gender'], df['topic'])],
    index=df.index,
)


In [263]:
df = df[['text_clean','labels']]

In [264]:
df.head() #final dataset for model building

Unnamed: 0,text_clean,labels
0,info found pages mb pdf files wait untill team...,"[male, Student]"
1,team members drewes van der laag urllink mail ...,"[male, Student]"
2,het kader van kernfusie op aarde maak je eigen...,"[male, Student]"
3,testing testing,"[male, Student]"
4,thanks yahoo toolbar capture urls popups means...,"[male, InvestmentBanking]"


## Create training and testing data

In [265]:
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed so the partition is reproducible
texts = df['text_clean'].values
targets = df['labels'].values
X_train, X_test, y_train, y_test = train_test_split(
    texts, targets, test_size=0.20, random_state=42
)

In [266]:
X_train[0]

'boston behalf great state illinois crossroads nation land lincoln let express deep gratitude privilege addressing convention tonight particular honor let face presence stage pretty unlikely father foreign student born raised small village kenya grew herding goats went school tin roof shack father grandfather cook domestic servant grandfather larger dreams son hard work perseverance father got scholarship study magical place america stood beacon freedom opportunity many come studying father met mother born town side world kansas father worked oil rigs farms depression day pearl harbor signed duty joined patton army marched across europe back home grandmother raised baby went work bomber assembly line war studied gi bill bought house fha moved west search opportunity big dreams daughter common dream born two continents parents shared improbable love shared abiding faith possibilities nation would give african name barack blessed believing tolerant america name barrier success imagined g

## Vectorize the data

### Create Bag of Words
- Use CountVectorizer
- Transform the training and testing data

In [267]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records presence/absence of a term rather than its count;
# ngram_range=(1, 2) builds features from single words and from word pairs.
vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
# The vocabulary is learned from the training split only and reused for the test split.
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [268]:
vectorizer.get_feature_names_out()[:10]

array(['aa', 'aa amazing', 'aa anger', 'aa compared', 'aa keeps',
       'aa nice', 'aa sd', 'aaa', 'aaa come', 'aaa discount'],
      dtype=object)

In [269]:
# X_train_bow.toarray()

### Create a dictionary to get label counts

In [270]:
# Tally how often each individual label (gender or topic) occurs in the dataset
label_counts = {}

for row_labels in df.labels.values:
    for name in row_labels:
        label_counts[name] = label_counts.get(name, 0) + 1

In [271]:
label_counts

{'male': 5916,
 'Student': 1137,
 'InvestmentBanking': 70,
 'female': 4084,
 'indUnk': 3287,
 'Non-Profit': 71,
 'Banking': 16,
 'Education': 270,
 'Engineering': 127,
 'Science': 63,
 'Communications-Media': 99,
 'BusinessServices': 91,
 'Sports-Recreation': 80,
 'Arts': 45,
 'Internet': 118,
 'Museums-Libraries': 17,
 'Accounting': 4,
 'Technology': 2654,
 'Law': 11,
 'Consulting': 21,
 'Automotive': 14,
 'Religion': 9,
 'Fashion': 1622,
 'Publishing': 4,
 'Marketing': 156,
 'LawEnforcement-Security': 10,
 'HumanResources': 2,
 'Telecommunications': 2}

## Multi label binarizer

Load a multilabel binarizer and fit it on the labels.

In [272]:
from sklearn.preprocessing import MultiLabelBinarizer

# Pin the class list to the sorted label names collected in label_counts.
mlb = MultiLabelBinarizer(classes=sorted(label_counts.keys()))
# NOTE: y_train / y_test are overwritten with their binarized form here, so this
# cell should not be re-run without first re-running the train/test split cell.
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

In [273]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Wrap logistic regression in a one-vs-rest scheme for the multi-label target
clf = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))

### Fit the classifier

In [274]:
clf.fit(X_train_bow, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

### Prediction 

In [275]:
predicted_labels = clf.predict(X_test_bow)
predicted_scores = clf.decision_function(X_test_bow)

### Get inverse transform for predicted labels and test labels

In [276]:
pred_inversed = mlb.inverse_transform(predicted_labels)
y_test_inversed = mlb.inverse_transform(y_test)

### Print some samples prediction 

In [277]:
# Show the first five test samples with their true and predicted label sets
sample_template = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
for i in range(5):
    sample_text = X_test[i]
    true_text = ','.join(y_test_inversed[i])
    pred_text = ','.join(pred_inversed[i])
    print(sample_template.format(sample_text, true_text, pred_text))

Title:	thanks stopping blog goal site place mostly vent corp bullshit tell stupid drunken stories host various ramblings peace eric
True labels:	Consulting,male
Predicted labels:	male


Title:	morning kinda forgot football today surpising since like every day except sunday next months something forgot football also forgot time football unbearable state affairs assure attempted mad dash stairs brought startlingly short fact legs want listen decided listen someone else someone completely unrelated also quite invisible told time would much better spent spasming legs want sorta quiver floor instead actually sprinting bodily mutany forced rely upon trusty much loyal arms transportation basement lair went well came door ever tried opening door top stairs without able use legs quite tricky end propped leaned hard managed stroke handle enough somehow pop open victory last one small problem football schedule fridge high fridge normally eye level towering frame eyes reduced midgethood ground sma

**The model predicted pretty well**

### Classification Report

In [290]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score,log_loss

def print_evaluation_scores(y_val, predicted, predicted_scores=None):
    """Print micro-averaged evaluation metrics for a multi-label prediction.

    Parameters
    ----------
    y_val
        True labels, in the binarized form passed by the calls in this notebook.
    predicted
        Hard (0/1) predicted labels in the same layout as ``y_val``.
    predicted_scores
        Optional continuous scores (for example the output of
        ``decision_function``). Average precision is a ranking metric, so it is
        computed from these scores when they are supplied. When omitted, the
        hard predictions are used, which keeps existing two-argument calls
        working exactly as before.

    Returns
    -------
    None. The metrics are printed.
    """
    print('Accuracy score: ', accuracy_score(y_val, predicted))
    print('F1 score: ', f1_score(y_val, predicted, average='micro'))
    # Average precision needs a ranking; prefer real scores over thresholded labels.
    ranking_input = predicted if predicted_scores is None else predicted_scores
    print('Average precision score: ', average_precision_score(y_val, ranking_input, average='micro'))
    print('Average recall score: ', recall_score(y_val, predicted, average='micro'))
    

In [303]:
print('\033[1m\033[4mBag-of-words\033[4m\033[0m')
print_evaluation_scores(y_test, predicted_labels)

[1m[4mBag-of-words[4m[0m
Accuracy score:  0.4405
F1 score:  0.7114337568058076
Average precision score:  0.5390730545142496
Average recall score:  0.637


## Experimenting with other vectorizer 
## TF-IDF

In [280]:
from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
# Limit the TF-IDF vocabulary to at most 5000 terms (no n-gram range is set here,
# unlike the CountVectorizer above).
vectorizer_tf = TfidfVectorizer(max_features=5000)
# Fit on the training split only; the test split reuses the learned vocabulary.
X_train_tf = vectorizer_tf.fit_transform(X_train)
X_test_tf = vectorizer_tf.transform(X_test)

In [281]:
vectorizer_tf.get_feature_names_out()[:10]

array(['aaron', 'ability', 'able', 'absence', 'absolute', 'absolutely',
       'abt', 'abuse', 'accent', 'accept'], dtype=object)

In [282]:
X_train_tf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## We can build Classifier using other algorithms e.g SVM

In [283]:
from sklearn.svm import SVC

In [284]:
# Build a one-vs-rest wrapper around an SVC with default hyper-parameters and fit
# it on the TF-IDF features in a single step.
clf_svc_tf_1 = OneVsRestClassifier(SVC()).fit(X_train_tf, y_train)

In [285]:
# clf_svc_tf.fit(X_train_tf, y_train)

In [286]:
predicted_labels_ttf = clf_svc_tf_1.predict(X_test_tf)
predicted_scores_tf = clf_svc_tf_1.decision_function(X_test_tf)

In [287]:
pred_inversed_tf = mlb.inverse_transform(predicted_labels_ttf)
y_test_inversed_tf = mlb.inverse_transform(y_test)

In [288]:
# Show the first five test samples with their true and TF-IDF/SVM predicted labels
sample_template_tf = 'Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'
for i in range(5):
    sample_text = X_test[i]
    true_text = ','.join(y_test_inversed_tf[i])
    pred_text = ','.join(pred_inversed_tf[i])
    print(sample_template_tf.format(sample_text, true_text, pred_text))

Title:	thanks stopping blog goal site place mostly vent corp bullshit tell stupid drunken stories host various ramblings peace eric
True labels:	Consulting,male
Predicted labels:	male


Title:	morning kinda forgot football today surpising since like every day except sunday next months something forgot football also forgot time football unbearable state affairs assure attempted mad dash stairs brought startlingly short fact legs want listen decided listen someone else someone completely unrelated also quite invisible told time would much better spent spasming legs want sorta quiver floor instead actually sprinting bodily mutany forced rely upon trusty much loyal arms transportation basement lair went well came door ever tried opening door top stairs without able use legs quite tricky end propped leaned hard managed stroke handle enough somehow pop open victory last one small problem football schedule fridge high fridge normally eye level towering frame eyes reduced midgethood ground sma

In [304]:
print('\033[1m\033[4mTF-IDF\033[4m\033[0m')
print_evaluation_scores(y_test, predicted_labels_ttf)

[1m[4mTF-IDF[4m[0m
Accuracy score:  0.3605
F1 score:  0.688231850117096
Average precision score:  0.517370245661824
Average recall score:  0.58775


   class color:
   PURPLE = '\033[95m'
   CYAN = '\033[96m'
   DARKCYAN = '\033[36m'
   BLUE = '\033[94m'
   GREEN = '\033[92m'
   YELLOW = '\033[93m'
   RED = '\033[91m'
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

### Insights on Comparing two vectorizer 

- Comparing the two, the count vectorizer gave a better F1 score (71%) than TF-IDF (69%) on the 10,000-row dataset.

- The count vectorizer performs well because it was fitted with n-grams (unigrams and bigrams), so the model could learn a larger vocabulary.

- Logistic Regression outperformed SVM. Since SVM doesn’t natively support multi-label classification, we wrapped it in a one-vs-rest classifier and then fitted it on the dataset.

- F1 score should be considered the most important metric for multi-label classification.

# Part 2 - Chat bot

In [None]:
# !pip install tflearn

**Load our GL json file**

In [232]:
# `nltk` itself must be imported: nltk.word_tokenize is called below, and on a
# fresh kernel the submodule imports alone do not bind the name `nltk`.
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

import numpy
import tflearn
import tensorflow
import random

import json
with open('GL bot.json') as file:
    data = json.load(file)

# Containers for the data extracted from the JSON file:
#   words  - every token seen in any pattern (deduplicated in a later cell)
#   labels - the distinct intent tags
#   docs_x - one token list per pattern
#   docs_y - the tag belonging to the pattern at the same position in docs_x
words = []
labels = []
docs_x = []
docs_y = []

# Walk through every intent. Each pattern is split into a list of tokens with
# nltk.word_tokenize (rather than kept as a raw string), added to docs_x, and
# its tag is added to docs_y so the two lists stay aligned.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# downloaded beforehand - confirm on a clean environment.
for intent in data['intents']:
    for pattern in intent['patterns']:
        wrds = nltk.word_tokenize(pattern)
        words.extend(wrds)
        docs_x.append(wrds)
        docs_y.append(intent["tag"])

    if intent['tag'] not in labels:
        labels.append(intent['tag'])

In [233]:
# Lowercase and stem every token except the literal "?", then keep the unique
# stems in sorted order.
stemmed_tokens = (stemmer.stem(token.lower()) for token in words if token != "?")
words = sorted(set(stemmed_tokens))

labels = sorted(labels)

**Create the bag of words and build the training data**

In [234]:
# Build one bag-of-words row and one one-hot label row per training pattern.
training = []
output = []

for pattern_tokens, tag in zip(docs_x, docs_y):
    pattern_stems = [stemmer.stem(tok.lower()) for tok in pattern_tokens]

    # 1 where the vocabulary word occurs in this pattern, 0 otherwise
    training.append([1 if vocab_word in pattern_stems else 0 for vocab_word in words])

    # one-hot encode the pattern's tag by its position in the label list
    one_hot = [0] * len(labels)
    one_hot[labels.index(tag)] = 1
    output.append(one_hot)

training = numpy.array(training)
output = numpy.array(output)

## Develop a model

In [235]:
# Reset the default TensorFlow graph before defining the network.
tensorflow.compat.v1.reset_default_graph()
# Input width equals the length of one bag-of-words row.
net = tflearn.input_data(shape=[None, len(training[0])])
# Three fully connected hidden layers with 32, 16 and 8 units.
# NOTE(review): no activation is passed for these layers; tflearn's default is
# believed to be linear - confirm whether a non-linearity was intended.
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, 16)
net = tflearn.fully_connected(net, 8)
# Output layer: one softmax unit per one-hot label position.
net = tflearn.fully_connected(net, len(output[0]), activation="softmax")
# Training layer with tflearn's default regression settings.
net = tflearn.regression(net)

model = tflearn.DNN(net)

In [238]:
model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
model.save("model.tflearn")

Training Step: 22999  | total loss: [1m[32m0.44479[0m[0m | time: 0.056s
| Adam | epoch: 1000 | loss: 0.44479 - acc: 0.9704 -- iter: 176/184
Training Step: 23000  | total loss: [1m[32m0.40032[0m[0m | time: 0.058s
| Adam | epoch: 1000 | loss: 0.40032 - acc: 0.9733 -- iter: 184/184
--
INFO:tensorflow:C:\Users\AI_SG\Documents\Upendran\Course\python\Projects\NLP\W3 NLP Projects\DATA\model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


In [239]:
# Reuse the saved checkpoint when it can be loaded; otherwise train and save.
try:
    model.load("model.tflearn")
except Exception:
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit propagate instead of silently starting a full training run.
    model.fit(training, output, n_epoch=1000, batch_size=8, show_metric=True)
    model.save("model.tflearn")

INFO:tensorflow:Restoring parameters from C:\Users\AI_SG\Documents\Upendran\Course\python\Projects\NLP\W3 NLP Projects\DATA\model.tflearn


**Making Predictions:**

*Now it's time to actually use the model! Ideally we want to generate a response to any sentence the user types in. To do this we need to remember that our model does not take string input; it takes a bag of words. We also need to realize that our model does not spit out sentences; it generates a list of probabilities for all of our classes. This makes the process of generating a response look like the following:*

    – Get some input from the user
    – Convert it to a bag of words
    – Get a prediction from the model
    – Find the most probable class
    – Pick a response from that class

In [244]:
def bag_of_words(s, words):
    """Encode sentence ``s`` as a 0/1 vector over the vocabulary ``words``.

    The sentence is tokenized, lowercased and stemmed with the module-level
    ``stemmer``. Position ``i`` of the result is 1 when ``words[i]`` equals one
    of the stemmed tokens and 0 otherwise.

    Returns a numpy array with one entry per vocabulary word.
    """
    sentence_stems = {stemmer.stem(token.lower()) for token in nltk.word_tokenize(s)}
    presence = [1 if vocab_word in sentence_stems else 0 for vocab_word in words]
    return numpy.array(presence)


def chat():
    """Run an interactive console chat loop until the user types "end".

    Each user line is converted to a bag of words, classified by the
    module-level ``model``, and answered with a random response from the
    intent whose tag has the highest predicted score.
    """
    print("Hi! I'm Bot. Hope your day is good.May I know what you are looking for ?\n" "Note: type end to stop!")

    # Build the tag -> responses lookup once. A later intent with a duplicate
    # tag overrides an earlier one, which matches a scan that keeps the last match.
    responses_by_tag = {intent['tag']: intent['responses'] for intent in data["intents"]}

    while True:
        inp = input("You: ")
        # Ignore surrounding whitespace so that "end " also stops the loop.
        if inp.strip().lower() == "end":
            break
        if not inp.strip():
            # Nothing to classify; ask again instead of answering an empty line.
            continue

        results = model.predict([bag_of_words(inp, words)])
        results_index = numpy.argmax(results)
        tag = labels[results_index]

        # Guard against a tag with no matching intent or an empty response list.
        # Otherwise `responses` would be unbound or left over from the previous
        # turn, and random.choice would fail on an empty list.
        responses = responses_by_tag.get(tag)
        if not responses:
            print("Sorry, I don't have an answer for that yet.")
            continue

        print(random.choice(responses))

chat()

Hi! I'm Bot. Hope your day is good.May I know what you are looking for ?
Note: type end to stop!
You: hai
Hi Welcome you to Great Learning Virtual Assistant! how can i help you ?
You: what is olympus
Usr this link to know more about olympus: https://olympus.mygreatlearning.com/courses/52238/pages/olympus-2-dot-0?module_item_id=1374482 
You: i need to learn about Machine learning
https://www.simplilearn.com/10-algorithms-machine-learning-engineers-need-to-know-article
You: what is KNN?
Kindly visit the link to know more:: https://en.wikipedia.org/wiki/Machine_learning 
You: What are my career opportunities in this AIML fields?
Transferring the request to your PM, Please hold on
You: my career path
Follow the link to know more: https://www.mygreatlearning.com/blog/
You: top free course in GL
Transferring the request to your PM, Please hold on
You: free course
You can able to access our free course here : https://www.mygreatlearning.com/academy/learn-for-free/courses
You: trending news in