In [2]:
# conda install -c conda-forge/label/cf202003 spacy-model-en_core_web_md

In [3]:
# conda install -c conda-forge/label/cf202003 spacy

In [11]:
# !conda install -c anaconda nltk -y

In [4]:
from spacy.lang.en import English

### making text into token

In [5]:
nlp=English()
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""
my_doc=nlp(text)
token_text=[]
for token in my_doc:
    token_text.append(token.text)
print(token_text)

['When', 'learning', 'data', 'science', ',', 'you', 'should', "n't", 'get', 'discouraged', '!', '\n', 'Challenges', 'and', 'setbacks', 'are', "n't", 'failures', ',', 'they', "'re", 'just', 'part', 'of', 'the', 'journey', '.', 'You', "'ve", 'got', 'this', '!']


### making text into sentence

In [6]:
nlp=English()
sbd=nlp.create_pipe('sentencizer')
nlp.add_pipe(sbd)
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""
doc=nlp(text)
sents_list=[]
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)  

["When learning data science, you shouldn't get discouraged!", "\nChallenges and setbacks aren't failures, they're just part of the journey.", "You've got this!"]


### Removing stopwords from text

In [7]:
from spacy.lang.en.stop_words import stopword
filter_sentence=[]
text = """When learning data science, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""
doc=nlp(text)
for word in doc:
    if word.is_stop==False:
        filter_sentence.append(word)
print(filter_sentence)
print(word)

[learning, data, science, ,, discouraged, !, 
, Challenges, setbacks, failures, ,, journey, ., got, !]
!


### Lemmatization

In [9]:
import en_core_web_md
nlp = en_core_web_md.load()
lem = nlp("run runs running runner")
# finding lemma for each word
for word in lem:
    print(word.text,word.lemma_)

run run
runs run
running run
runner runner


### POS Tagging

In [10]:
import en_core_web_md
nlp = en_core_web_md.load()
docs = nlp(u"All is well that ends well.")

for word in docs:
    print(word.text,word.pos_)

All DET
is AUX
well ADV
that DET
ends VERB
well ADV
. PUNCT


### Loading the dataset

In [14]:
import pandas as pd
import re
import nltk
data=pd.read_csv('train.csv')

### performing operation on first 100 tweets

In [16]:
tweets=data.tweet[:100]

In [17]:
tweets.head().tolist()

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

### Cleaning the data

In [18]:
tweets=tweets.str.lower() 
tweets=tweets.apply(lambda x:re.sub("[^a-z\s]","",x))
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
tweets=tweets.apply(lambda x:" ".join(word for word in x.split() if word not in stopwords ))

In [19]:
tweets.head().tolist()

['user father dysfunctional selfish drags kids dysfunction run',
 'user user thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked',
 'bihday majesty',
 'model love u take u time ur',
 'factsguide society motivation']

### creating the data intotoken

In [20]:
import spacy
import en_core_web_md
import numpy as np
nlp=en_core_web_md.load()
document=nlp(tweets[0])
print('documents',document)
print('Token','')
for token in document:
    print(token.text)

documents user father dysfunctional selfish drags kids dysfunction run
Token 
user
father
dysfunctional
selfish
drags
kids
dysfunction
run


### Making vector for data

In [21]:
print(document)
document=nlp(tweets[0])
for token in document:
    print(token.text,token.vector.shape)

user father dysfunctional selfish drags kids dysfunction run
user (300,)
father (300,)
dysfunctional (300,)
selfish (300,)
drags (300,)
kids (300,)
dysfunction (300,)
run (300,)


### Now taking wholedata

In [23]:
tweets=data.tweet

In [25]:
tweets.head().tolist()

[' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run',
 "@user @user thanks for #lyft credit i can't use cause they don't offer wheelchair vans in pdx.    #disapointed #getthanked",
 '  bihday your majesty',
 '#model   i love u take with u all the time in urð\x9f\x93±!!! ð\x9f\x98\x99ð\x9f\x98\x8eð\x9f\x91\x84ð\x9f\x91\x85ð\x9f\x92¦ð\x9f\x92¦ð\x9f\x92¦  ',
 ' factsguide: society now    #motivation']

In [26]:
tweets=tweets.str.lower() 
tweets=tweets.apply(lambda x:re.sub("[^a-z\s]","",x))
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
tweets=tweets.apply(lambda x:" ".join(word for word in x.split() if word not in stopwords ))

In [27]:
tweets.head().tolist()

['user father dysfunctional selfish drags kids dysfunction run',
 'user user thanks lyft credit cant use cause dont offer wheelchair vans pdx disapointed getthanked',
 'bihday majesty',
 'model love u take u time ur',
 'factsguide society motivation']

In [28]:
import spacy
import en_core_web_md
import numpy as np
nlp=en_core_web_md.load()
document=nlp(tweets[0])
print('documents',document)
print('Token','')
for token in document:
    print(token.text)

documents user father dysfunctional selfish drags kids dysfunction run
Token 
user
father
dysfunctional
selfish
drags
kids
dysfunction
run


In [29]:
document=nlp.pipe(tweets)
tweets_vector=np.array([tweet.vector for tweet in document])
print(tweets_vector.shape)

(31962, 300)


### Applying LogisticRegession model

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [31]:
x=tweets_vector
y=data.label

In [32]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,stratify=y,random_state=0)
model=LogisticRegression(C=0.1)
model.fit(x_train,y_train)



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
y_pred=model.predict(x_train)

In [34]:
print('accuracy:',accuracy_score(y_train,y_pred))

accuracy: 0.9447548384213114


In [35]:
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     20804
           1       0.73      0.33      0.46      1569

    accuracy                           0.94     22373
   macro avg       0.84      0.66      0.71     22373
weighted avg       0.94      0.94      0.93     22373



In [36]:
y_pred_test=model.predict(x_test)

In [37]:
accuracy_score(y_test,y_pred_test)

0.9443111899050995

In [38]:
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      8916
           1       0.75      0.31      0.44       673

    accuracy                           0.94      9589
   macro avg       0.85      0.65      0.70      9589
weighted avg       0.94      0.94      0.93      9589

