In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import nltk

import re
import string
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

For now, this notebook analyses only the Twitter data; the Reddit data will be analysed later.

In [None]:
df = pd.read_csv('../input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv')

In [None]:
df.shape

In [None]:
df.head()

In [None]:
df.category.value_counts()

So, there are 3 different values for category.
Let's encode these numeric values as categorical labels as follows:

* -1 to Negative,
* 0 to Neutral,
* 1 to Positive.

In [None]:
# Replace the numeric sentiment codes with readable labels. Series.map() with a dict turns any
# value that is not a key (including missing values) into NaN.
# NOTE: this cell is not idempotent - re-running it without reloading df maps the already
# converted string labels to NaN.
df['category']=df['category'].map({-1.0:'Negative', 0.0:'Neutral', 1.0:'Positive'})

In [None]:
# Rename the description column name
df['Tweet'] = df['clean_text']

In [None]:
df.drop('clean_text', axis=1, inplace=True)

In [None]:
df.head()

Now, let's start the cleaning process.

The usual cleaning process in NLP involves:- 

* Remove missing values, if any.
* Remove unwanted characters such as punctuation.
* Convert all uppercase letters to lowercase, since the machine treats them differently even though we know that 'cat' and 'CAT' mean the same thing.
* Remove words that follow a certain pattern, such as links, email addresses, or usernames. These words do not contribute much to the analysis and can be removed from the text with the help of regular expressions.
* Remove all stopwords, such as pronouns and articles. These words occur very frequently in any sentence but do not contribute much to NLP analysis, so they can be removed.
* Finally, reduce each word to its root form.
Example: the root word for 'Playing' and 'Played' is 'Play'.



In [None]:
df.isna().sum()

There are 7 records with a missing category and 4 records with a missing Tweet.
Let's remove these incomplete records from the dataset.

In [None]:
df = df.dropna()

In [None]:
df.isna().sum()

In [None]:
punct = string.punctuation
punct

In [None]:
#alpha = [' ','a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

In [None]:
stopWords = stopwords.words('english')
stopWords

###  There are 2 ways to find the root word. 
1. Stemming: a rule-based algorithm that strips suffixes such as 'ing', 's', 'es', etc. The resulting word may not be a valid English word. It is computationally faster than lemmatizing.

2. Lemmatizing: this algorithm uses a vocabulary (here, WordNet) together with the word's morphology to find its dictionary form (the lemma), so the result is a valid word. It is a bit slower than stemming.

In [None]:
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

We know that 'goose' and 'geese' denote the same thing; one is singular and the other plural. However, stemming and lemmatizing treat them differently. See the example below.

In [None]:
print(ps.stem('geese'))
print(ps.stem('goose'))

In [None]:
print(wn.lemmatize('geese'))
print(wn.lemmatize('goose'))

Lemmatization can be used when we have a smaller dataset, as it will not take much time. If we have a very large dataset, lemmatization can be time-expensive; in that case, prefer stemming.

Now, lets write a function to clean the data. 

In [None]:
def cleanData(text):
    """Normalise one tweet into a space-separated string of lemmatised tokens.

    Processing steps:
      1. Lowercase the text.
      2. Replace every character that is not an ASCII letter or digit with a space
         (this removes punctuation, symbols and non-English characters).
      3. Split on whitespace, drop English stopwords, and lemmatise each remaining
         word with the WordNet lemmatiser.

    Relies on the notebook-level names `re`, `stopWords` and `wn`.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; an empty string when no token survives.
    """
    # Lowercase first so that 'CAT' and 'cat' become the same token.
    text = text.lower()

    # Anything other than letters/digits becomes a space. Every punctuation character is
    # covered by this pattern, so a separate pass over string.punctuation would be a no-op.
    text = re.sub(r"[^A-Za-z0-9]", ' ', text)

    # Set membership is O(1) per word instead of a scan over the whole stopword list.
    stop_set = set(stopWords)

    # BUG FIX: the previous filter `(word not in stopWords) & len(word)!=0` was parsed as
    # `((word not in stopWords) & len(word)) != 0` because `&` binds tighter than `!=`.
    # That bitwise-ANDs a bool with the word length, so every even-length word
    # (e.g. 'good', 'modi') was silently discarded. str.split() with no argument already
    # drops empty strings, so only the stopword test is required.
    tokens = [wn.lemmatize(word) for word in text.split() if word not in stop_set]

    return ' '.join(tokens)

In [None]:
df['Tweet'] = df['Tweet'].apply(cleanData) 

In [None]:
df.head()

In [None]:
df['Tweet'][0]

See, the Tweet has been cleaned.

Let's create a column with the word length of each tweet and then analyze it.

In [None]:
def find_len(txt):
    """Return the number of whitespace-separated words in ``txt``."""
    words = txt.split()
    return len(words)

In [None]:
# Word count of every cleaned tweet, stored next to the text it was computed from.
df['Txt_len'] = df['Tweet'].apply(find_len)

In [None]:
df.head()

In [None]:
df.groupby('category').count()

In [None]:
ax = sns.countplot(x='category', data=df)

In [None]:
ax = sns.boxplot(y='Txt_len', data=df)

We can see that most of the tweets are between 0 and 15 words long.

In [None]:
ax = sns.histplot(x = 'Txt_len', data=df)

In [None]:
df[df['Txt_len']>20].count()

There are only 227 Tweets of length more than 20 words.

In [None]:
majority_tweet = df[df['Txt_len']<10]

In [None]:
ax = sns.countplot(x = 'category', data = majority_tweet)

We can observe that the majority of the tweets are positive or neutral.

In [None]:
## Tweets left with zero words after cleaning. Presumably these are mostly tweets written in
## other languages/scripts with no English characters (the cleaning regex keeps only A-Z, a-z
## and 0-9); tweets made up entirely of stopwords also end up here.

Zero_len_tweet = df[df['Txt_len']==0]

In [None]:
ax = sns.countplot(x='category', data = Zero_len_tweet)

In [None]:
df[df['category']=='Positive']['Tweet']

In [None]:
# Lets plot cloud plot for each category.

In [None]:
#pos_tweet = ' '.join([word for word in df[df['category']=='Positive']['Tweet']])

In [None]:
def printWordCloud(x,cat):
    """Show a word cloud built from the tweets of a single sentiment category.

    x   : DataFrame; its 'category' and 'Tweet' columns are read
    cat : category value used to select rows (also shown in the plot title)

    Displays the figure with matplotlib and returns None.
    """
    # Pick the rows labelled `cat` and merge their tweets into one long string.
    in_category = x['category'] == cat
    selected_tweets = x[in_category]['Tweet']
    cat_tweet = ' '.join(tweet for tweet in selected_tweets)

    # At most 50 words on a white background, skipping wordcloud's built-in stopwords.
    cloud = WordCloud(background_color='white', max_words=50, stopwords=STOPWORDS)

    # Draw the cloud with a title and no axes.
    plt.figure(figsize=(20, 10))
    plt.imshow(cloud.generate(cat_tweet))
    plt.title('{} Sentiment Words'.format(cat), fontsize=20)
    plt.axis('off')
    plt.show()


Now, with the help of word clouds, let's see which words are used most often in each category.

In [None]:
printWordCloud(df,'Positive')

In [None]:
printWordCloud(df,'Negative')

In [None]:
printWordCloud(df,'Neutral')

In [None]:
df.shape

So, now we are almost done with data cleaning and analysis. Next, we need to convert the data into a format that the machine can read, i.e. convert the Tweet column into numeric form. This is also called vectorization.

# There are 3 ways to do that.
1. Count vectorization
2. N-gram
3. Tfidf Vectorization. 

Here in this notebook I will be using Tfidf (term frequency–inverse document frequency).

This method creates a column for each word and assigns a weight to each word used in a particular tweet (record).

In [None]:
# Build TF-IDF features from the cleaned tweets. sublinear_tf=True replaces the raw term
# frequency tf with 1 + log(tf), damping the effect of a word repeated within one tweet.
vector = TfidfVectorizer(sublinear_tf=True)
# fit_transform learns the vocabulary and returns one row per tweet and one column per term.
X = vector.fit_transform(df['Tweet'].values)

In [None]:
# Number of distinct terms learned by the vectorizer. Read from the fitted vocabulary_ mapping
# (term -> column index) instead of get_feature_names(), which was removed in scikit-learn 1.2.
len(vector.vocabulary_)

So, there are a total of 53309 unique words across all tweets, and that many columns have been created. For each of these words, a weight is assigned for each tweet.

In [None]:
X.shape[0]

In [None]:
# Feature names in column order. get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vector, 'get_feature_names_out'):
    X_col = list(vector.get_feature_names_out())
else:
    X_col = vector.get_feature_names()

In [None]:
X_col[:20]

In [None]:
type(X)

In [None]:
from scipy.sparse import csr_matrix

In [None]:
# Wrap the sparse TF-IDF matrix in a DataFrame with sparse columns, one column per term in X_col.
# No index is passed, so rows get a default 0..n-1 index rather than df's index labels.
train = pd.DataFrame.sparse.from_spmatrix(X, columns = X_col)

In [None]:
train.head()

In [None]:
# Assign by position, not by index label: `train` has a fresh 0..n-1 index, while `df` kept its
# original (gapped) labels after dropna(). A plain Series assignment would align on labels,
# putting lengths on the wrong rows and leaving NaN where a label is missing.
assert len(train) == len(df), "train and df must have one row per tweet"
train['Txt_len'] = df['Txt_len'].values

In [None]:
train.head()

In [None]:
train.shape

Yeah.....! That's it for now.

Any feedback or comments will be very much appreciated...!

# Thank you..!