# Build a clustering model on text data. 

### Data Loading and Exploration

##### Data Loading

In [None]:
import pandas as pd

# Column labels passed to read_csv via `names=`; with explicit names and no
# `header=` argument pandas treats the first line of the file as data.
columns = [
    'polarity',
    'id',
    'date',
    'query',
    'user',
    'text',
]

# Read the tweet dataset, decoding the file as latin-1.
df = pd.read_csv(
    'training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='latin-1',
)




##### Data Exploration

In [None]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)


In [None]:
# Print a summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

### Text Preprocessing

In [None]:
#Clean the tweet text by removing special characters, URLs, and unnecessary symbols.
#Tokenize the text into individual words or subword units

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# URLs and @mentions are stripped in their own pass, *before* punctuation is
# removed. In a single combined pattern the non-alphanumeric alternative
# matches first at a preceding space and swallows the "@", which leaves the
# user name behind ("hi @bob" -> "hi bob").
_URL_OR_MENTION_RE = re.compile(r"http\S+|www\S+|@\S+", re.IGNORECASE)
_NON_ALNUM_RE = re.compile(r"[^A-Za-z0-9]+")


def clean_text(text):
    """Return a lower-cased, alphanumeric-only version of a tweet.

    URLs (``http...``/``www...``, matched case-insensitively) and
    ``@mentions`` are removed, every remaining run of non-alphanumeric
    characters becomes a single space, and surrounding whitespace is
    stripped.

    Args:
        text: Raw tweet text. Non-string values (e.g. NaN from a missing
            CSV field) are treated as empty.

    Returns:
        The cleaned text; ``""`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return ""
    without_links = _URL_OR_MENTION_RE.sub(" ", text)
    # Runs of punctuation/whitespace collapse to one space here, so only the
    # ends of the string still need trimming.
    alnum_only = _NON_ALNUM_RE.sub(" ", without_links)
    return alnum_only.lower().strip()

# Run the cleaner over every raw tweet and keep the result in its own column.
raw_tweets = df['text']
df['cleaned_text'] = raw_tweets.map(clean_text)




In [None]:
# Tokenization (using NLTK)
# word_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# ship it as 'punkt'; NLTK >= 3.9 looks for 'punkt_tab' instead and raises
# LookupError without it. Requesting both keeps the cell working on either;
# a resource the installed index does not know is reported and skipped.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

df['tokens'] = df['cleaned_text'].apply(nltk.word_tokenize)



In [None]:
# Fetch NLTK's stopword corpus and keep the English list as a set so that
# each membership test below is a constant-time lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens of one tweet that are not English stopwords."""
    return [token for token in tokens if token not in stop_words]


df['filtered_tokens'] = df['tokens'].map(_without_stopwords)

### Word Embeddings (TF-IDF)

In [None]:
# Convert the preprocessed tweet text into sparse TF-IDF vector representations (weighted term counts, not dense embeddings).
# Use the Term Frequency-Inverse Document Frequency (TF-IDF) method.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the stopword-filtered tokens rather than on the raw cleaned
# text: otherwise the stopword removal above is never used and very common
# words take up part of the 1000-term vocabulary budget. The token lists are
# re-joined with spaces because the vectorizer expects one string per document.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(df['filtered_tokens'].str.join(' '))
tfidf_matrix

# tfidf_matrix is a sparse matrix with one row per tweet and one column per
# vocabulary term (at most 1000 columns).


### Aggregating Text Vectors

In [None]:
# Each row of the TF-IDF matrix is already a single vector representing one whole tweet.
# The cell below additionally averages the weights in each row, which reduces every tweet to one summary value.

In [None]:
import numpy as np

# Mean TF-IDF weight per tweet. Note this is ONE number per tweet, not a
# vector: averaging over axis=1 collapses all vocabulary columns.
# The sparse mean comes back as a 2-D (n_tweets, 1) matrix; flatten it to a
# plain 1-D array so the column assignment is unambiguous and the column
# holds ordinary floats.
average_tfidf_vector = np.asarray(tfidf_matrix.mean(axis=1)).ravel()
df['tfidf_vector'] = average_tfidf_vector


### Clustering (K-Means)

In [None]:
#Set the number of clusters (K) based on domain knowledge or use techniques like the elbow method.
#Train the K-Means model on the aggregated tweet vectors.

In [None]:

import numpy as np
from sklearn.cluster import KMeans

# The model object was never created, so the fit call below raised NameError.
# K is fixed at 5 here; tune it (e.g. with the elbow method) for real use.
# n_init and random_state are set explicitly so runs are reproducible and do
# not depend on the scikit-learn version's default number of restarts.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Cluster on the full TF-IDF matrix: each row is already the vector for one
# tweet, and KMeans accepts sparse input directly. Fitting on the per-tweet
# mean weight instead would cluster on a single scalar per tweet and discard
# which terms the tweet actually contains.
# (For faster training on very large corpora, MiniBatchKMeans exposes the
# same fit/labels_ interface.)
kmeans.fit(tfidf_matrix)

# Assign cluster labels to each tweet (labels_ is in the same row order as
# the matrix that was fitted).
df['cluster_label'] = kmeans.labels_




In [None]:
# Explore the resulting clusters.
# Iterate over the labels actually present instead of a hard-coded range, and
# cap the sample at the cluster size: Series.sample(5) raises ValueError when
# a cluster holds fewer than 5 tweets.
for cluster_id, cluster_texts in df.groupby('cluster_label')['text']:
    print(f"Cluster {cluster_id}:")
    print(cluster_texts.sample(min(5, len(cluster_texts))))
    print("\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sentiment analysis (example using rule-based approach)
# Whole-word, case-insensitive patterns: plain substring tests would label
# "unhappy" as Positive and "crusade" as Negative.
_POSITIVE_RE = re.compile(r"\bhappy\b", re.IGNORECASE)
_NEGATIVE_RE = re.compile(r"\bsad\b", re.IGNORECASE)


def get_sentiment(text):
    """Classify a tweet with a deliberately simple keyword rule.

    This is a placeholder for real sentiment logic: the word "happy" means
    Positive, otherwise the word "sad" means Negative, otherwise Neutral.
    "happy" takes precedence when both words occur.

    Args:
        text: Tweet text. Non-string values (e.g. NaN) are Neutral.

    Returns:
        One of ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    if not isinstance(text, str):
        return "Neutral"
    if _POSITIVE_RE.search(text):
        return "Positive"
    if _NEGATIVE_RE.search(text):
        return "Negative"
    return "Neutral"

df['sentiment'] = df['text'].apply(get_sentiment)


# Print the sentiment for a bounded preview of tweets. Printing every row of
# a frame this size floods the notebook output, and iterrows() builds a
# Series per row; itertuples over just the two needed columns is much cheaper.
# Raise preview_rows (or use len(df)) to print more.
preview_rows = 10
preview = df[['text', 'sentiment']].head(preview_rows)
for index, tweet, sentiment in preview.itertuples(index=True, name=None):
    print(f"Tweet {index + 1}: {tweet} - Sentiment: {sentiment}")

In [None]:
# Tally the sentiment labels. These names were used below without ever being
# defined, which raised NameError. A plain dict in a fixed label order keeps
# the bar order stable and reports 0 for a label that never occurs.
label_counts = df['sentiment'].value_counts()
sentiment_counts = {
    label: int(label_counts.get(label, 0))
    for label in ("Positive", "Negative", "Neutral")
}
positive_count = sentiment_counts["Positive"]
negative_count = sentiment_counts["Negative"]
neutral_count = sentiment_counts["Neutral"]

# Plotting
plt.figure(figsize=(8, 6))
sns.barplot(x=list(sentiment_counts.keys()), y=list(sentiment_counts.values()))
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

print("Total Positive Tweets:", positive_count)
print("Total Negative Tweets:", negative_count)
print("Total Neutral Tweets:", neutral_count)
