<h1>ML Assignment</h1>

<h2>Install</h2>

In [None]:
#!pip install matplotlib
#!pip install wordcloud
#!pip install scattertext
#!pip install textblob
#!pip install networkx

<h2>Import</h2>

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA 

In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px

import collections

import string
import re

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from textblob import TextBlob
from scipy.stats import kde
import networkx as nx

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

nltk.download('wordnet')
# Ngrams allows to group words in common pairs or trigrams..etc
from nltk import ngrams
# We can use counter to count the objects
from collections import Counter

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tqz11\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
from datetime import datetime
from collections.abc import Iterable

<h2>Input data</h2>

In [5]:
# Column labels for the CSV, in file order; passed via `names=` below.
header = ["target", "id", "date", "flag", "user", "text"]

# Load the tweets with the labels above instead of reading a header row.
df = pd.read_csv(
    "X_dataset.csv",
    names=header,
    encoding="ISO-8859-1",
)

In [6]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# selecting target and features
text = df["text"]


In [8]:
text.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1600000 entries, 0 to 1599999
Series name: text
Non-Null Count    Dtype 
--------------    ----- 
1600000 non-null  object
dtypes: object(1)
memory usage: 12.2+ MB


In [9]:
print(text.shape)

(1600000,)


In [10]:
text = text.str.lower()

In [11]:
text.head()

0    @switchfoot http://twitpic.com/2y1zl - awww, t...
1    is upset that he can't update his facebook by ...
2    @kenichan i dived many times for the ball. man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object

<h2>Remove @Mentions and E-mail Domains</h2>

In [12]:
def cleaning_email(df):
    """Replace every ``@``-prefixed run of non-whitespace with a single space.

    Despite the name, the pattern is not e-mail specific: it matches ``@``
    followed by one or more non-whitespace characters, so it strips
    Twitter-style @mentions as well as the ``@domain`` half of an address.

    Args:
        df: The text to clean. This is a single string, despite the name.

    Returns:
        The input with each ``@...`` run replaced by ``' '``.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about.
    return re.sub(r'@[^\s]+', ' ', df)

In [13]:
text = text.apply(lambda x: cleaning_email(x))

In [14]:
text.head()

0      http://twitpic.com/2y1zl - awww, that's a bu...
1    is upset that he can't update his facebook by ...
2      i dived many times for the ball. managed to ...
3      my whole body feels itchy and like its on fire 
4      no, it's not behaving at all. i'm mad. why a...
Name: text, dtype: object

<h2>Remove URL</h2>

In [15]:
def cleaning_URLs(df):
    """Replace URLs in a string with a single space.

    A URL is anything starting with ``www.`` or ``http://`` / ``https://``
    and running up to the next whitespace character.

    Args:
        df: The text to clean. This is a single string, despite the name.

    Returns:
        The input with each matched URL replaced by ``' '``.
    """
    # Raw string: '\.' and '\s' in a plain literal are invalid escape
    # sequences that newer Python versions warn about.
    return re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', df)

In [16]:
text = text.apply(lambda x: cleaning_URLs(x))

In [17]:
text.head()

0        - awww, that's a bummer.  you shoulda got ...
1    is upset that he can't update his facebook by ...
2      i dived many times for the ball. managed to ...
3      my whole body feels itchy and like its on fire 
4      no, it's not behaving at all. i'm mad. why a...
Name: text, dtype: object

<h2>Punctuation</h2>

In [18]:
english_punctuations = string.punctuation
english_punctuations

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
def cleaning_punctuations(text):
    """Delete every character listed in ``english_punctuations`` from a string.

    Args:
        text: The string to clean.

    Returns:
        The string with those characters removed; nothing is inserted in
        their place.
    """
    # Mapping each punctuation character to None makes translate() drop it.
    deletion_table = str.maketrans(dict.fromkeys(english_punctuations))
    return text.translate(deletion_table)

<h2>Remove Punctuation</h2>

In [20]:
text = text.apply(lambda text: cleaning_punctuations(text))

In [21]:
text.head()

0         awww thats a bummer  you shoulda got davi...
1    is upset that he cant update his facebook by t...
2      i dived many times for the ball managed to s...
3      my whole body feels itchy and like its on fire 
4      no its not behaving at all im mad why am i h...
Name: text, dtype: object

<h2>Remove Numbers</h2>

In [22]:
def cleaning_numbers(data):
    """Strip every run of ASCII digits (0-9) from a string.

    Args:
        data: The string to clean.

    Returns:
        The string with all digit runs removed.
    """
    digit_run = re.compile('[0-9]+')
    return digit_run.sub('', data)

In [23]:
text = text.apply(lambda x: cleaning_numbers(x))

In [24]:
text.head()

0         awww thats a bummer  you shoulda got davi...
1    is upset that he cant update his facebook by t...
2      i dived many times for the ball managed to s...
3      my whole body feels itchy and like its on fire 
4      no its not behaving at all im mad why am i h...
Name: text, dtype: object

<h2>Sample Stopwords</h2>

In [25]:
stopwords_list = stopwords.words('english')
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

<h2>Remove Stop Words</h2>

In [26]:
# Set form of the NLTK list for fast membership tests.
# NOTE: this rebinds the name STOPWORDS that the import cell took from wordcloud.
STOPWORDS = set(stopwords_list)


def cleaning_stopwords(text):
    """Remove stop words from a string.

    The input is coerced with ``str()``, split on whitespace, and every token
    found in ``STOPWORDS`` is dropped.

    Args:
        text: The value to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

In [27]:
text = text.apply(lambda text: cleaning_stopwords(text))

In [28]:
text.head()

0    awww thats bummer shoulda got david carr third...
1    upset cant update facebook texting might cry r...
2    dived many times ball managed save rest go bounds
3                     whole body feels itchy like fire
4                             behaving im mad cant see
Name: text, dtype: object

<h2>Tokenization</h2>


In [29]:
tokenizer = RegexpTokenizer(r'\w+')
text = text.apply(tokenizer.tokenize)

In [30]:
text.head()

0    [awww, thats, bummer, shoulda, got, david, car...
1    [upset, cant, update, facebook, texting, might...
2    [dived, many, times, ball, managed, save, rest...
3              [whole, body, feels, itchy, like, fire]
4                       [behaving, im, mad, cant, see]
Name: text, dtype: object

In [31]:
# Flatten the token lists of the first ten tweets into one list for a quick look.
normal_visual = []
for tweet_tokens in text[:10]:
    normal_visual.extend(tweet_tokens)
normal_visual

['awww',
 'thats',
 'bummer',
 'shoulda',
 'got',
 'david',
 'carr',
 'third',
 'day',
 'upset',
 'cant',
 'update',
 'facebook',
 'texting',
 'might',
 'cry',
 'result',
 'school',
 'today',
 'also',
 'blah',
 'dived',
 'many',
 'times',
 'ball',
 'managed',
 'save',
 'rest',
 'go',
 'bounds',
 'whole',
 'body',
 'feels',
 'itchy',
 'like',
 'fire',
 'behaving',
 'im',
 'mad',
 'cant',
 'see',
 'whole',
 'crew',
 'need',
 'hug',
 'hey',
 'long',
 'time',
 'see',
 'yes',
 'rains',
 'bit',
 'bit',
 'lol',
 'im',
 'fine',
 'thanks',
 'hows',
 'nope',
 'didnt',
 'que',
 'muera']

<h2>Lemmatizer</h2>

In [32]:
def lemmatizer_on_text(words):
    """Lemmatize each token of one document with NLTK's WordNetLemmatizer.

    Args:
        words: An iterable of word tokens.

    Returns:
        A list with the lemmatized form of each token, in the same order.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, words))


In [33]:
text = text.apply(lambda x: lemmatizer_on_text(x))

In [34]:
text.head()

0    [awww, thats, bummer, shoulda, got, david, car...
1    [upset, cant, update, facebook, texting, might...
2    [dived, many, time, ball, managed, save, rest,...
3               [whole, body, feel, itchy, like, fire]
4                       [behaving, im, mad, cant, see]
Name: text, dtype: object

# Function to remove nouns using POS tagging

In [35]:
# Function to remove nouns using POS tagging
def remove_nouns(tokenized_words, noun_tags=('NN', 'NNS')):
    """Return the tokens whose part-of-speech tag is not in ``noun_tags``.

    Args:
        tokenized_words: List of word tokens for one document.
        noun_tags: Tags (as produced by ``nltk.pos_tag``) to filter out.
            The default keeps the original behaviour of dropping only
            'NN' and 'NNS'. Pass ``('NN', 'NNS', 'NNP', 'NNPS')`` to drop
            proper nouns as well.

    Returns:
        The remaining words, in their original order.
    """
    excluded_tags = frozenset(noun_tags)
    tagged_words = nltk.pos_tag(tokenized_words)
    return [word for word, tag in tagged_words if tag not in excluded_tags]



In [36]:
 
# Print the wall-clock time before the tagging step so its cost is visible.
c = datetime.now()
current_time = c.strftime('%H:%M:%S')
print('Current Time is:', current_time)

# Tag all tweets in one batched call. The recorded run of this cell, which
# called nltk.pos_tag once per row, ended in a KeyboardInterrupt.
# pos_tag_sents obtains the tagger once for the whole batch instead of once
# per tweet.
tagged_tweets = nltk.pos_tag_sents(text.tolist())

# Drop common nouns (same 'NN' / 'NNS' filter as remove_nouns). Reusing
# text.index keeps the new column aligned with the rows of df.
df['Without_Nouns_data_normal'] = pd.Series(
    [[word for word, tag in tagged if tag not in ('NN', 'NNS')] for tagged in tagged_tweets],
    index=text.index,
)

# Re-join the surviving tokens into one string per tweet; non-iterables become ''.
df['Recombined_data_normal'] = df['Without_Nouns_data_normal'].apply(lambda y: ' '.join(y) if isinstance(y, Iterable) else '')

# Print the wall-clock time after the tagging step.
c = datetime.now()
current_time = c.strftime('%H:%M:%S')
print('Current Time is:', current_time)

Current Time is: 15:50:36


KeyboardInterrupt: 

In [None]:
text_pos = df['Recombined_data_normal']
text_pos.head(1000)

# Random Sampling

In [None]:
sample_size = 10000

# Simple random sample of the recombined tweets (fixed seed for reproducibility).
samplepos = text_pos.sample(n=sample_size, random_state=42)

# NOTE(review): `normal` was never defined in this notebook, so this cell
# raised NameError on a fresh kernel. A later cell reads normal_RS["text"],
# so the source frame is assumed to be the recombined tweets under a "text"
# column. Confirm this is the intended input.
normal = text_pos.to_frame(name="text")

# Systematic sampling: take every `interval`-th row from a random start.
# max(1, ...) guards the case of fewer rows than sample_size, where an
# interval of 0 would break both the random draw and the slice step.
interval = max(1, len(normal) // sample_size)

# Seeded generator so the start offset is as reproducible as samplepos above.
start = int(np.random.default_rng(42).integers(0, interval))

# Perform systematic sampling
normal_RS = normal.iloc[start::interval]



In [None]:
# Keep only the text column of the systematic sample.
# NOTE: this rebinds `df`, which until this cell held the full tweet DataFrame.
df = normal_RS["text"]
# The cell used to end in a bare `print`, which only echoes the function
# object. Show a preview of the sample instead.
df.head()

# Convert Text to TF-IDF Vectors

In [None]:
# Turn each sampled tweet into a TF-IDF weighted sparse row.
vectorizer = TfidfVectorizer()
# NOTE(review): this used `df_series`, which is never defined in the notebook.
# The preceding cell binds the sampled text Series to `df`, so that is used
# here. Confirm.
X = vectorizer.fit_transform(df)

In [None]:
print(X)

## Applying K Means Clustering

In [None]:
# Apply K-means clustering
num_clusters = 2  # You can choose the number of clusters based on domain knowledge or using techniques like the elbow method
# random_state pins the centroid initialisation so labels are reproducible
# across runs. n_init is spelled out because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)


In [None]:
# Get cluster labels
cluster_labels = kmeans.labels_

In [None]:
YN = input("Do you want to show Cluster 1 or Cluster 2 (one/two):")

# Map the accepted answers to the zero-based cluster index to print.
cluster_choice = {"one": 0, "two": 1}

if YN not in cluster_choice:
    print("ERROR")
else:
    selected_cluster = cluster_choice[YN]
    # NOTE(review): this cell indexed `normal_clr`, which is never defined in
    # the notebook. The documents are assumed to be the sampled text in `df`,
    # in the same row order as `cluster_labels`. Confirm.
    documents = list(df)
    for i in range(num_clusters):
        print(f"Cluster {i + 1}:")
        if i == selected_cluster:
            # The loop variable is not called `text`, so the global `text`
            # Series built earlier is no longer overwritten.
            for document, label in zip(documents, cluster_labels):
                if label == i:
                    print(document)
            print()
        else:
            print("NO")

In [None]:
# Timestamp before scoring, to show how long the evaluation takes.
c = datetime.now()
current_time = format(c, '%H:%M:%S')
print('Current Time is:', current_time)

# Silhouette score of the K-means labels on the TF-IDF matrix (optional quality check).
silhouette_avg = silhouette_score(X, cluster_labels)
print("Silhouette Score:", silhouette_avg)

# Timestamp after scoring.
c = datetime.now()
current_time = format(c, '%H:%M:%S')
print('Current Time is:', current_time)



In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

# Project the TF-IDF matrix to two dimensions purely for plotting.
# random_state keeps the projection reproducible between runs.
svd = TruncatedSVD(n_components=2, random_state=42)
X_reduced = svd.fit_transform(X)

# One figure for both the points and the centroids. Previously the centroids
# were drawn after plt.show(), so they landed alone on a second, empty figure.
fig, ax = plt.subplots()

# Plot clusters. The unused `cluster_texts` list was dropped: it indexed the
# undefined name `normal_clr` and raised NameError.
for i in range(num_clusters):
    in_cluster = cluster_labels == i
    ax.scatter(X_reduced[in_cluster, 0], X_reduced[in_cluster, 1], label=f'Cluster {i + 1}')

# Plot centroids, projected with the same fitted SVD as the points.
centroids_reduced = svd.transform(kmeans.cluster_centers_)
ax.scatter(centroids_reduced[:, 0], centroids_reduced[:, 1], marker='x', s=100, c='red', label='Centroids')

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('K-means Clustering of Text Data')
ax.legend()
plt.show()


# Agglomerative Hierarchical Clustering Algorithm

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering

In [None]:

# Number of clusters
num_clusters = 2

# Create an instance of AgglomerativeClustering
hierarchical_cluster = AgglomerativeClustering(n_clusters=num_clusters)

# Fit the model on the 2-D projection and predict the labels
labels = hierarchical_cluster.fit_predict(X_reduced)

fig, ax = plt.subplots()

# Plot clusters using the agglomerative `labels`. Previously this section
# re-plotted the K-means `cluster_labels` and the K-means centroids under a
# K-means title, so the fitted hierarchical model was never shown.
for i in range(num_clusters):
    members = X_reduced[labels == i]
    ax.scatter(members[:, 0], members[:, 1], label=f'Cluster {i + 1}')

# The model was fitted only for labels, so the marker for each cluster is
# computed here as the mean of its points in the reduced space.
centroids_reduced = np.array([X_reduced[labels == i].mean(axis=0) for i in range(num_clusters)])
ax.scatter(centroids_reduced[:, 0], centroids_reduced[:, 1], marker='x', s=100, c='red', label='Centroids')

ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.set_title('Agglomerative Hierarchical Clustering of Text Data')
ax.legend()
plt.show()

<h2>Word Cloud</h2>

In [None]:
# Combine the text for the Word Cloud.
# NOTE(review): this used `data`, which is never defined in the notebook. The
# random sample `samplepos` (one string per tweet) is assumed. Confirm.
wordcloud_text = samplepos.str.cat(sep=' ')
# Size of Word Cloud # (max_font_size = 100, max_words = 50,)
plt.rcParams["figure.figsize"] = (15,15)
# Load the image whose shape is used as the mask for the Word Cloud
custom_mask = np.array(Image.open('twitter_wordcloud.png'))
# Make Wordcloud
wordcloud = WordCloud(background_color = "white", colormap = 'plasma', mask = custom_mask).generate(wordcloud_text)

# Plot Wordcloud (the empty plt.plot() call was dropped; imshow creates the axes itself)
plt.imshow(wordcloud, interpolation = "bilinear")
plt.axis("off")
plt.show()

In [None]:
# Frequency of every whitespace-separated token in the word-cloud text.
res = Counter(wordcloud_text.split())
# The ten most frequent (word, count) pairs, shown as the cell output.
res_mostcommon = res.most_common(10)
res_mostcommon

In [None]:
values, counts = zip(*res_mostcommon)
values2,counts2 = zip(*res.most_common(100))

In [None]:
import plotly.graph_objects as go
import pandas as pd

# Radar (polar) chart: one spoke per word, radius is its count.
radar_trace = go.Scatterpolar(
    r=counts,
    theta=values,
    fill='toself',
    name='Word',
)
fig = go.Figure(data=[radar_trace])
# Title and a square canvas
fig.update_layout(title='Radar Chart', width=800, height=800)
fig.show()

In [None]:


# Bar chart of the most common words. Labels and a title are added so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.bar(values, counts, color ='maroon',
       width = 0.4)
ax.set_xlabel('Word')
ax.set_ylabel('Count')
ax.set_title('Most Common Words')

plt.show()

In [None]:
# Same word counts drawn twice on one figure: first as a line, then as bars.
line_trace = go.Scatter(x=values, y=counts)
bar_trace = go.Bar(x=values, y=counts)

fig = go.Figure(data=[line_trace, bar_trace])
fig.show()

In [None]:
# Imported here because the top-level `from scipy.stats import kde` relies on
# a submodule namespace that recent SciPy releases deprecate.
from scipy.stats import gaussian_kde

# Calculate sentiment polarity for each tweet.
# NOTE(review): this used `data`, which is never defined in the notebook. The
# random sample `samplepos` (one string per tweet) is assumed. Confirm.
data_sentiment = samplepos.apply(lambda x: TextBlob(x).sentiment.polarity)

# data_sentiment is a Series with one polarity score per tweet; fit a
# Gaussian kernel density estimate to it.
data_density = gaussian_kde(data_sentiment)

# Evaluate the density on 100 evenly spaced points across the observed range
x = np.linspace(data_sentiment.min(), data_sentiment.max(), 100)
density = data_density(x)

plt.plot(x, density, label='Sentiment Distribution')
plt.xlabel('Sentiment Polarity')
plt.ylabel('Density')
plt.title('Sentiment Distribution in data_sentiment')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Count bigrams tweet by tweet, so no pair spans two different tweets.
# NOTE(review): this used `data2`, which is never defined in the notebook. The
# random sample `samplepos` (one string per tweet) is assumed. Confirm.
bigram_counts = Counter()
for tweet in samplepos:
    bigram_counts.update(ngrams(tweet.split(), 2))

# Get the top 5 most common bigrams
top_5_bigrams = bigram_counts.most_common(5)

# Create a new graph
G = nx.Graph()

# Add edges with weights for all bigrams. The graph is undirected, so (a, b)
# and (b, a) are the same edge: their counts are summed instead of the later
# pair overwriting the earlier one.
for (first_word, second_word), count in bigram_counts.items():
    if G.has_edge(first_word, second_word):
        G[first_word][second_word]['weight'] += count
    else:
        G.add_edge(first_word, second_word, weight=count)

# Extract just the bigrams from the top 5 for highlighting
top_bigrams = [bigram for bigram, count in top_5_bigrams]

# Graph nodes are single words, so highlight the words that occur in a top
# bigram. Comparing a node with the bigram tuples themselves never matched.
top_words = {word for bigram in top_bigrams for word in bigram}
node_sizes = [1000 if node in top_words else 100 for node in G.nodes()]

# Explicitly create a figure and axes object
plt.figure(figsize=(12, 8))  # Optional: specify the size of the figure
ax = plt.gca()  # Get the current axes

# Draw the graph with enlarged nodes for the words of the top 5 bigrams
pos = nx.spring_layout(G)  # Positioning of nodes
nx.draw(G, pos, ax=ax, node_color='lightblue', node_size=node_sizes, with_labels=True)

# Finally, display the plot
plt.show()

