In [18]:
import numpy as np
import pandas as pd
import gensim,nltk,string,os,zipfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from nltk.stem import WordNetLemmatizer,PorterStemmer,LancasterStemmer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df['text']

0       Our Deeds are the Reason of this #earthquake M...
1                  Forest fire near La Ronge Sask. Canada
2       All residents asked to 'shelter in place' are ...
3       13,000 people receive #wildfires evacuation or...
4       Just got sent this photo from Ruby #Alaska as ...
                              ...                        
7608    Two giant cranes holding a bridge collapse int...
7609    @aria_ahrary @TheTawniest The out of control w...
7610    M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...
7611    Police investigating after an e-bike collided ...
7612    The Latest: More Homes Razed by Northern Calif...
Name: text, Length: 7613, dtype: object

In [4]:
[w for w in word_tokenize(df['text'].iloc[0]) if w not in stopwords.words('english')]

['Our', 'Deeds', 'Reason', '#', 'earthquake', 'May', 'ALLAH', 'Forgive', 'us']

In [5]:
data = []
def text_cleaning(x):
    for i in x['text']:
        data.append([w for w in word_tokenize(i.lower()) if w not in stopwords.words('english') and w not in string.punctuation]) 

In [6]:
text_cleaning(df)

In [7]:
data

[['deeds', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'us'],
 ['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada'],
 ['residents',
  'asked',
  "'shelter",
  'place',
  'notified',
  'officers',
  'evacuation',
  'shelter',
  'place',
  'orders',
  'expected'],
 ['13,000',
  'people',
  'receive',
  'wildfires',
  'evacuation',
  'orders',
  'california'],
 ['got',
  'sent',
  'photo',
  'ruby',
  'alaska',
  'smoke',
  'wildfires',
  'pours',
  'school'],
 ['rockyfire',
  'update',
  'california',
  'hwy',
  '20',
  'closed',
  'directions',
  'due',
  'lake',
  'county',
  'fire',
  'cafire',
  'wildfires'],
 ['flood',
  'disaster',
  'heavy',
  'rain',
  'causes',
  'flash',
  'flooding',
  'streets',
  'manitou',
  'colorado',
  'springs',
  'areas'],
 ["'m", 'top', 'hill', 'see', 'fire', 'woods', '...'],
 ["'s",
  'emergency',
  'evacuation',
  'happening',
  'building',
  'across',
  'street'],
 ["'m", 'afraid', 'tornado', 'coming', 'area', '...'],
 ['three', 'peop

# 1. Training Your Own Model - Word2Vec

In [8]:
trained = Word2Vec(data,min_count=2,sg=1,window=2)

In [12]:
trained.wv.similar_by_word('earthquake')

[('real', 0.9987149834632874),
 ('drowned', 0.9986874461174011),
 ('exploded', 0.9986623525619507),
 ('terrorist', 0.9986556172370911),
 ('end', 0.9986073970794678),
 ('area', 0.9985902905464172),
 ('4', 0.9985886812210083),
 ('great', 0.9985824227333069),
 ('lol', 0.9985775947570801),
 ('might', 0.9985739588737488)]

In [13]:
trained.wv.get_vector('earthquake')

array([-0.16769353,  0.0178898 ,  0.09899525, -0.0830802 , -0.11643381,
       -0.36784244,  0.12461206,  0.4808725 , -0.11617592, -0.2423674 ,
       -0.13377012, -0.14965281, -0.08578119,  0.11691414, -0.07421583,
       -0.1987719 ,  0.10240614, -0.11359329, -0.21820137, -0.4115698 ,
        0.11928599, -0.00362729, -0.00054583, -0.07060292, -0.00496722,
       -0.00809798, -0.05259161, -0.16835733, -0.09196518,  0.1325324 ,
        0.26731208, -0.03899104,  0.1660759 , -0.05590305,  0.05858663,
        0.01186336,  0.0686678 , -0.29383785,  0.10122518, -0.29155186,
       -0.1623201 , -0.10136265, -0.03324253,  0.10868829,  0.13907722,
       -0.16552728, -0.03279367, -0.00907618,  0.26236132,  0.1579492 ,
        0.11329678, -0.12990919, -0.16546679,  0.0033845 , -0.12808104,
        0.10872381,  0.11732896,  0.10231113,  0.03241859, -0.04672313,
        0.1170281 ,  0.01734778, -0.12817629,  0.08112647, -0.24921827,
        0.35466588, -0.07161692,  0.1396113 , -0.30108467,  0.34

# 2. Word2Vec - embeddings

In [14]:
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [15]:
w2v = api.load('word2vec-google-news-300')

In [16]:
w2v.similar_by_word('celebrate')

[('celebrating', 0.8155195116996765),
 ('celebrated', 0.7704004049301147),
 ('celebration', 0.7418112754821777),
 ('commemorate', 0.7094196081161499),
 ('celebrates', 0.6778473258018494),
 ('Celebrate', 0.6725128293037415),
 ('Celebrating', 0.6519656777381897),
 ('celebrations', 0.6102868914604187),
 ('cele_brate', 0.5931569337844849),
 ('rejoice', 0.5903198719024658)]

In [17]:
w2v.get_vector('celebrate')

array([-4.34570312e-02, -1.35742188e-01,  1.18164062e-01,  2.06054688e-01,
        2.33398438e-01, -3.18359375e-01,  7.56835938e-02, -2.65625000e-01,
        7.42187500e-02,  7.76367188e-02, -2.71484375e-01, -7.27539062e-02,
       -2.84423828e-02,  3.78906250e-01,  2.94921875e-01,  7.37304688e-02,
        1.25000000e-01,  2.22656250e-01,  2.39257812e-01, -1.33789062e-01,
        1.47460938e-01,  2.91015625e-01,  2.10937500e-01, -2.49023438e-01,
        1.76757812e-01, -5.06591797e-03,  1.71875000e-01,  4.17480469e-02,
        2.41210938e-01, -1.57226562e-01,  2.03125000e-01,  7.26318359e-03,
        3.08227539e-03,  1.12792969e-01, -1.34765625e-01, -1.23535156e-01,
        2.55859375e-01, -3.37890625e-01,  2.61718750e-01,  2.13867188e-01,
        2.73437500e-02, -7.56835938e-03,  2.77099609e-02,  9.86328125e-02,
        2.05078125e-01, -1.34765625e-01, -3.35937500e-01,  5.39550781e-02,
       -1.98974609e-02,  3.49609375e-01, -2.12402344e-02,  2.39257812e-01,
        2.46093750e-01,  

# 3. Loading GloVe vector

In [19]:
zipped_folder = zipfile.ZipFile('glove.6B.zip')

In [22]:
zipped_folder.filelist

[<ZipInfo filename='glove.6B.50d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=171350079 compress_size=69182485>,
 <ZipInfo filename='glove.6B.100d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=347116733 compress_size=134300389>,
 <ZipInfo filename='glove.6B.200d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=693432828 compress_size=264336891>,
 <ZipInfo filename='glove.6B.300d.txt' compress_type=deflate filemode='-rw-rw-r--' file_size=1037962819 compress_size=394362180>]

In [23]:
GloVe_data = zipped_folder.extract('glove.6B.50d.txt')

In [42]:
for i in open(GloVe_data, encoding="utf8"):
    print("Below is the word: ")
    print(i.split(" ")[0])
    print('\n')
    print("Below is the Vector: ")
    print(i.split(" ")[1:])
    
    break

Below is the word: 
the


Below is the Vector: 
['0.418', '0.24968', '-0.41242', '0.1217', '0.34527', '-0.044457', '-0.49688', '-0.17862', '-0.00066023', '-0.6566', '0.27843', '-0.14767', '-0.55677', '0.14658', '-0.0095095', '0.011658', '0.10204', '-0.12792', '-0.8443', '-0.12181', '-0.016801', '-0.33279', '-0.1552', '-0.23131', '-0.19181', '-1.8823', '-0.76746', '0.099051', '-0.42125', '-0.19526', '4.0071', '-0.18594', '-0.52287', '-0.31681', '0.00059213', '0.0074449', '0.17778', '-0.15897', '0.012041', '-0.054223', '-0.29871', '-0.15749', '-0.34758', '-0.045637', '-0.44251', '0.18785', '0.0027849', '-0.18411', '-0.11514', '-0.78581\n']


In [39]:
embedding_vectors = {}
for i in open(GloVe_data, encoding="utf8"):
    word = i.split(" ")[0]
    embedding = i.split(" ")[1:]
    embedding_vectors[word] = embedding

the
['0.418', '0.24968', '-0.41242', '0.1217', '0.34527', '-0.044457', '-0.49688', '-0.17862', '-0.00066023', '-0.6566', '0.27843', '-0.14767', '-0.55677', '0.14658', '-0.0095095', '0.011658', '0.10204', '-0.12792', '-0.8443', '-0.12181', '-0.016801', '-0.33279', '-0.1552', '-0.23131', '-0.19181', '-1.8823', '-0.76746', '0.099051', '-0.42125', '-0.19526', '4.0071', '-0.18594', '-0.52287', '-0.31681', '0.00059213', '0.0074449', '0.17778', '-0.15897', '0.012041', '-0.054223', '-0.29871', '-0.15749', '-0.34758', '-0.045637', '-0.44251', '0.18785', '0.0027849', '-0.18411', '-0.11514', '-0.78581\n']


In [31]:
embedding_vectors.get('celebrate')

['0.12191',
 '0.81221',
 '-1.0196',
 '-0.65983',
 '1.2592',
 '-1.1492',
 '-0.9591',
 '0.29959',
 '0.10093',
 '0.48487',
 '-0.1485',
 '-0.71627',
 '0.44745',
 '-0.11944',
 '0.79406',
 '-0.11197',
 '0.088971',
 '-0.59892',
 '-0.55688',
 '-0.10685',
 '0.48526',
 '0.010769',
 '-0.53309',
 '-0.04685',
 '0.54524',
 '-0.63864',
 '-1.1066',
 '-0.75267',
 '0.1718',
 '-0.54662',
 '1.6035',
 '1.9806',
 '-1.3012',
 '0.26014',
 '-0.4527',
 '-0.32813',
 '0.45034',
 '0.083061',
 '-0.91002',
 '0.32949',
 '-0.74186',
 '-0.376',
 '-0.94503',
 '-0.70876',
 '0.16447',
 '0.75677',
 '-0.25292',
 '-0.77441',
 '-0.65655',
 '-0.55295\n']

# 4. Simple Chatbot using TFIDF

In [43]:
df = pd.read_csv('Botdata.csv',encoding='cp1252')

In [44]:
df.Responses.value_counts(normalize=True)

Link: Neural Nets wiki                       0.143357
Link: Machine Learning wiki                  0.139860
Transferring the request to your PM          0.129371
I hope I was able to assist you, Good Bye    0.125874
Link: Olympus wiki                           0.122378
Hello! how can i help you ?                  0.115385
Please use respectful words                  0.115385
I am your virtual learning assistant         0.108392
Name: Responses, dtype: float64

In [45]:
df['Pattern'].iloc[7]

'listen'

In [46]:
WNL = WordNetLemmatizer()

In [47]:
[WNL.lemmatize(i) for i in word_tokenize(df['Pattern'].iloc[281])]

['Please', 'get', 'some', 'live', 'person', 'to', 'clear', 'my', 'doubt']

In [123]:
data = []
def clean_text(x):
    for i in x:
        data.append(' '.join([''.join(WNL.lemmatize(w,pos='v')) for w in word_tokenize(i.lower()) if w not in string.punctuation])) 

In [124]:
clean_text(df['Pattern'])

In [125]:
data

['hi',
 'how be you',
 'hello there',
 'hello',
 'whats up',
 'hey',
 'yo',
 'listen',
 'please help me',
 'hi there',
 'hello bot',
 'whats up for today',
 'hello guy i need a help',
 'hey there',
 'i have a quick question',
 'how to start',
 'online',
 'hey ya',
 'talk to you for first time',
 'anyone there',
 'i be here to get help',
 'someone help me please',
 'ello',
 'wassuppp',
 'whats happen around the portal',
 'i have few quick question',
 'i need a help',
 'there',
 'what be your name',
 'who be you',
 'how do they call you',
 'do i know you',
 'who be there',
 'who be you',
 'your name please',
 'may i know your name',
 'speak up',
 'be you a human',
 'do you answer question',
 'who be learn assistant',
 'who be support assistant',
 'any assistant',
 'whos that i be speak to',
 'may i know who you be',
 'who be speak',
 'be you a bot or a human',
 'who be i speak to a bot or a person',
 'how do i call you',
 'can i call you vla',
 'be you my teacher',
 'they call you with w

In [126]:
tf = TfidfVectorizer()

In [127]:
tfidf = tf.fit_transform(data)

In [128]:
tfidf.toarray().shape

(286, 291)

In [54]:
LE = LabelEncoder()

In [55]:
X = tfidf.toarray()
y = LE.fit_transform(df['Responses'])

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [65]:
RFC = RandomForestClassifier(n_estimators=200)

In [66]:
RFC.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200)

In [67]:
pred = RFC.predict(X_test)

In [68]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.50      0.55      0.52        11
           1       0.62      0.62      0.62         8
           2       0.27      0.75      0.40         4
           3       0.83      0.77      0.80        13
           4       0.73      0.42      0.53        19
           5       1.00      0.92      0.96        12
           6       0.44      1.00      0.62         4
           7       0.67      0.53      0.59        15

    accuracy                           0.64        86
   macro avg       0.63      0.70      0.63        86
weighted avg       0.70      0.64      0.65        86



In [143]:
def pred(x):
    data=[]
    data.append(' '.join([''.join(WNL.lemmatize(w,pos='v')) for w in word_tokenize(x.lower()) if w not in string.punctuation]))
    x = tf.transform(data)
    pred = RFC.predict(x.toarray())
    print(LE.inverse_transform(pred))

In [150]:
pred("What is your name")

['I am your virtual learning assistant']
