<a href="https://colab.research.google.com/github/Shrutakeerti/Twitter-Post-Sentiment-Analysis/blob/main/untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from keras.models import Model
from keras.layers import Input, Dense

nltk.download('stopwords')
from nltk.corpus import stopwords

# Step 1: Data Collection and Preprocessing
# Dummy dataset
data = {
    'post': [
        'I love this product!',
        'This is the worst service ever.',
        'Amazing experience, will come again.',
        'I am very disappointed.',
        'Totally worth it!',
        'Never coming back here.'
    ],
    'reply': [
        'Me too, it’s fantastic!',
        'I agree, it’s terrible.',
        'Absolutely wonderful!',
        'Same here, very let down.',
        'Indeed, it’s great!',
        'Terrible experience.'
    ]
}

df = pd.DataFrame(data)

# Preprocessing function
def preprocess_text(text):
    text = re.sub(r'\W', ' ', str(text))
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    return text

# Apply preprocessing
df['post'] = df['post'].apply(preprocess_text)
df['reply'] = df['reply'].apply(preprocess_text)

# Combine post and reply for analysis
df['post_reply'] = df['post'] + ' ' + df['reply']

# Step 2: Feature Extraction
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_features=5000)
X = vectorizer.fit_transform(df['post_reply']).toarray()

# Step 3: Autoencoder Model
input_dim = X.shape[1]
encoding_dim = 128

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation='relu')(input_layer)
decoder = Dense(input_dim, activation='sigmoid')(encoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# Train the autoencoder
autoencoder.fit(X, X, epochs=50, batch_size=32, shuffle=True, validation_split=0.2)

# Extract the encoder part for dimensionality reduction
encoder_model = Model(inputs=input_layer, outputs=encoder)
encoded_X = encoder_model.predict(X)

# Step 4: Clustering
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(encoded_X)

# Assign clusters to the original data
df['cluster'] = clusters

# Step 5: Evaluation
silhouette_avg = silhouette_score(encoded_X, clusters)
print(f'Silhouette Score: {silhouette_avg}')

# Display the clustered data
print(df)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Silhouette Score: 0.11824134737253189
                                 post                    reply  \
0                 i love this product    me too it s fantastic   
1      this is the worst service ever    i agree it s terrible   
2  amazing experience will come again     absolutely wonderful   
3              i am very disappointed  same here very let down   
4                    totally worth it    

