In [None]:
pip install wordcloud
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import string
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
from PIL import Image
from tqdm import tqdm
from collections import Counter
nltk.download('punkt')
nltk.download('stopwords')


# Load the tweet dataset (decoded as latin-1) and print a quick overview.
df = pd.read_csv("Mental-Health-Twitter.csv", encoding='latin-1')
# Bare expressions such as the next two only display a value when run as the
# last statement of a notebook cell; in a plain script they have no effect.
df.head()
df.shape
print("Rows     : ", df.shape[0])
print("Columns  : ", df.shape[1])
print("\nFeatures : \n", df.columns.tolist())
print("\nMissing values :  ", df.isnull().sum().values.sum())
print("\nUnique values :  \n", df.nunique())
df.dtypes

# Row counts on either side of the 1000-follower mark (rows with exactly 1000
# followers fall into neither bucket).
# NOTE(review): len() counts rows of df, not distinct user_id values -- confirm
# that "users" is the intended unit here.
print(len(df[df['followers'] < 1000]), 'users with less than 1000 followers')
print(len(df[df['followers'] > 1000]), 'users with more than 1000 followers')

# user_id of the first row holding the maximum follower count. It is looked up
# from the data instead of being pasted in as a literal, so the selection
# below keeps working if the CSV changes.
top_user_id = df[df['followers'] == df['followers'].max()]['user_id'].iloc[0]
top_follower = df[df['user_id'] == top_user_id].reset_index(drop=True)
top_follower.head()
def remove_line_breaks(text):
    """Return `text` with every carriage return and newline turned into a space.

    Each '\\r' and each '\\n' becomes its own space, so a '\\r\\n' pair yields
    two spaces.
    """
    break_to_space = str.maketrans({'\r': ' ', '\n': ' '})
    return text.translate(break_to_space)

# Replace punctuation characters with spaces
def remove_punctuation(text):
    """Tokenize `text` and blank out punctuation inside each token.

    Tokens that begin with a `__UPPERCASE__` marker are passed through
    unchanged. In every other token each character from
    `string.punctuation` is replaced by a single space. The processed tokens
    are joined back together with single spaces.
    """
    placeholder = re.compile("__[A-Z]+__")
    # re.escape escapes all characters in the pattern except ASCII letters
    # and numbers, so the punctuation set is safe inside a character class.
    punctuation = re.compile("[%s]" % re.escape(string.punctuation))

    def scrub(token):
        if placeholder.match(token):
            return token
        return punctuation.sub(" ", token)

    return ' '.join(scrub(token) for token in word_tokenize(text))

def remove_special_characters(text):
    """Delete every character that is not an ASCII letter, a digit 0-9, or whitespace.

    The letter range is `A-Z`; the previous `A-z` range also covered the
    characters that sit between 'Z' and 'a' in ASCII ([ \\ ] ^ _ and the
    backtick), so those slipped through unfiltered.

    Parameters:
        text: the string to filter.

    Returns:
        `text` with all other characters removed (nothing is inserted in
        their place).
    """
    # Raw string so that \s reaches the regex engine as written.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def lowercase(text):
    """Tokenize `text`, lower-case each token, and re-join them with single spaces."""
    lowered_tokens = map(str.lower, word_tokenize(text))
    return ' '.join(lowered_tokens)

def remove_stopwords(text):
    """Drop NLTK's English stopwords from `text`.

    The text is tokenized, tokens found in the stopword set are discarded
    (the comparison is exact, so it is case-sensitive), and the remaining
    tokens are joined with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    kept = []
    for token in nltk.word_tokenize(text):
        if token in english_stopwords:
            continue
        kept.append(token)
    return " ".join(kept)

# Drop single-character tokens
def remove_one_character_words(text):
    """Return `text` re-joined from its tokens, keeping only tokens longer than one character."""
    long_enough = filter(lambda token: len(token) > 1, word_tokenize(text))
    return ' '.join(long_enough)

# Stemming reduces a word to a base or root form. The result is not always a
# dictionary word, but related words end up sharing the same stem.
# This uses NLTK's Snowball stemmer for English.
def stem(text):
    """Tokenize `text`, stem each token with the English Snowball stemmer, and join with spaces."""
    snowball = nltk.stem.snowball.SnowballStemmer('english')
    stems = map(snowball.stem, word_tokenize(text))
    return ' '.join(stems)

def lemma(text):
    """Lemmatize every token of `text` and return them joined by single spaces.

    Each token is passed to `WordNetLemmatizer.lemmatize` without a
    part-of-speech argument. The tokens are joined exactly once; joining the
    already-joined string a second time would insert a space between every
    character of the result.

    Parameters:
        text: the string to lemmatize.

    Returns:
        A single string of lemmatized tokens separated by spaces.

    NOTE(review): WordNetLemmatizer presumably needs the NLTK 'wordnet'
    corpus, which the nltk.download calls visible in this file do not fetch
    -- confirm it is available before calling this function.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    word_tokens = nltk.word_tokenize(text)
    return ' '.join(wordnet_lemmatizer.lemmatize(word) for word in word_tokens)

# Split a sentence into a list of word tokens
def sentence_word(text):
    """Return the list of tokens produced by `nltk.word_tokenize` for `text`."""
    return nltk.word_tokenize(text)
# Split a paragraph into a list of sentences
def paragraph_sentence(text):
    """Return the list of sentences produced by `nltk.sent_tokenize` for `text`."""
    return nltk.sent_tokenize(text)


def tokenize(text):
    """Split `text` into its maximal runs of word characters, in order of appearance."""
    return [match.group() for match in re.finditer(r'\w+', text)]


def remove_numbers(text):
    """Return `text` with every run of digits deleted; nothing is inserted in its place."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)



def clean_text(text):
    """Run `text` through the cleaning pipeline and return the result.

    The transformations are applied in this fixed order, each one receiving
    the output of the previous one: line-break removal, one-character-word
    removal, special-character removal, lower-casing, punctuation removal,
    stopword removal, stemming, and finally digit removal.
    """
    pipeline = (
        remove_line_breaks,
        remove_one_character_words,
        remove_special_characters,
        lowercase,
        remove_punctuation,
        remove_stopwords,
        stem,
        remove_numbers,
    )
    cleaned = text
    for transform in pipeline:
        cleaned = transform(cleaned)
    return cleaned
# Coerce every post to a string and turn colons into spaces before cleaning.
df["post_text"] = [post.replace(':', ' ') for post in df["post_text"].astype(str)]

# Run the cleaning pipeline over every post, with a tqdm progress bar.
cleaned_posts = [clean_text(post) for post in tqdm(df['post_text'])]
df['clean_text'] = pd.Series(cleaned_posts)

# Plain list of the cleaned posts as strings, used as word-cloud input below.
words = df["clean_text"].values
ls = [str(entry) for entry in words]
ls[:5]
# Word cloud built from all cleaned posts joined into one string
plt.figure(figsize=(16, 13))
wc = WordCloud(
    background_color="lightblue",
    colormap='Set2',
    max_words=1000,
    max_font_size=200,
    width=1600,
    height=800,
)
corpus = " ".join(ls)
wc.generate(corpus)
plt.title("Most discussed terms", fontsize=20)
# Recolor with a fixed random_state before drawing the cloud.
recolored = wc.recolor(colormap='Set2', random_state=17)
plt.imshow(recolored, alpha=0.98, interpolation="bilinear")
plt.axis('off')
# Collect every whitespace-separated word from the raw posts whose label is 0.
# NOTE(review): the variable name says "spam", but nothing visible here
# establishes what label 0 means -- confirm against the dataset description.
all_spam_words = [
    word
    for sentence in df[df['label'] == 0]['post_text'].to_list()
    for word in sentence.split()
]

# The 25 most frequent words go into their own frame. The result must not be
# assigned back to `df`: the follower analysis further down still needs the
# original dataset and its 'followers' column.
word_freq = pd.DataFrame(Counter(all_spam_words).most_common(25), columns=['Word', 'Frequency'])

sns.set_context('notebook', font_scale=1.3)
plt.figure(figsize=(18, 8))
sns.barplot(y=word_freq['Word'], x=word_freq['Frequency'], palette='summer')
plt.title("Most Commonly Used Words")
plt.xlabel("Frequency")
plt.ylabel("Words")
plt.show()
word_freq.head()
# The 12 rows of df with the highest follower counts, plus a helper column
# holding the follower count in thousands for a more readable axis.
# NOTE(review): this ranks rows of df, not distinct users -- if a user_id can
# appear on several rows, the top 12 may repeat the same user; consider
# de-duplicating on user_id first.
most_pop = df.sort_values('followers', ascending=False)[['user_id', 'followers']].head(12)
most_pop['followers1'] = most_pop['followers'] / 1000

# Draw the chart once; the second, verbatim copy of this plotting code only
# rendered an identical figure and has been removed.
plt.figure(figsize=(20, 25))
sns.barplot(data=most_pop, y='user_id', x='followers1', color='c')
plt.xticks(fontsize=27, rotation=0)
plt.yticks(fontsize=30, rotation=0)
plt.xlabel('User followers in Thousands', fontsize=21)
plt.ylabel('')
plt.title('Followers', fontsize=30)