In [None]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

## Read Data

In [None]:
# Read the raw training data. The CSV has no header row, so the column names
# are supplied explicitly.
# NOTE(review): machine-specific absolute path; point DATA_PATH at your copy.
DATA_PATH = 'C:/Users/ROG/Desktop/train.csv'
column_names = ["sentiment", "title", "content"]
df = pd.read_csv(DATA_PATH, header=None, names=column_names)

# Fill missing text by assigning the result back to the frame. Calling
# fillna(inplace=True) on `df['title']` acts on an intermediate object
# (chained assignment) and is not guaranteed to update `df`. `content` is
# filled as well because later cells call string methods on every value.
text_columns = ["title", "content"]
df[text_columns] = df[text_columns].fillna("")

# Shift the labels down by one; the plots further down select
# sentiment == 1 and sentiment == 0.
# (Presumably the raw labels are 1/2 -- confirm against the data.)
df['sentiment'] = df['sentiment'] - 1

## Data Cleaning and Preprocessing

In [None]:
def clean_text(text):
    """Normalize one raw text value for later token-level processing.

    Steps, in order: strip HTML-like tags (``<...>``), lowercase, remove
    ASCII punctuation, remove runs of digits. Whitespace is left untouched.

    Parameters
    ----------
    text : str or missing value
        Raw text. Missing values (``None`` / NaN) are treated as empty text;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    if not isinstance(text, str):
        # A missing cell reaches us as NaN/None; without this guard re.sub
        # raises TypeError and aborts the whole .apply().
        if pd.isna(text):
            return ""
        text = str(text)
    # HTML
    text = re.sub(r'<.*?>', '', text)
    # lowercase
    text = text.lower()
    # remove punctuation and numbers
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\d+', '', text)
    return text

# Apply the basic normalization to both raw text columns.
for raw_column in ('title', 'content'):
    df[raw_column + '_cleaned'] = df[raw_column].apply(clean_text)

# Only the last expression of a cell is rendered automatically. This preview
# sits in the middle of the cell, so it has to be shown explicitly.
display(df[['title', 'title_cleaned', 'content', 'content_cleaned']].head())
# stop words
# The stop-word list is a separately downloaded NLTK corpus. Fetch it on
# first use so a fresh environment does not stop with LookupError.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` found in `stop_words`.

    Matching is exact and case-sensitive. The surviving tokens are returned
    joined by single spaces, so an input made only of stop words yields "".
    """
    return " ".join(word for word in text.split() if word not in stop_words)

# Strip stop words from the already-normalized columns, in place.
for cleaned_column in ('title_cleaned', 'content_cleaned'):
    df[cleaned_column] = df[cleaned_column].apply(remove_stopwords)

# Only the last expression of a cell is rendered automatically. This preview
# sits in the middle of the cell, so it has to be shown explicitly.
display(df[['title', 'title_cleaned', 'content', 'content_cleaned']].head())
# Stemming
stemmer = PorterStemmer()


def stem_text(text):
    """Stem each whitespace-separated token of `text` with `stemmer`.

    Returns the stems joined by single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

# Reduce every remaining token to its stem, one cleaned column at a time.
for stem_column in ('title_cleaned', 'content_cleaned'):
    df[stem_column] = df[stem_column].apply(stem_text)

# Last expression of the cell: raw text next to its fully processed form.
preview_columns = ['title', 'title_cleaned', 'content', 'content_cleaned']
df[preview_columns].head()

## Data Description and Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter

# Use seaborn's white-grid style for the figures drawn from here on.
sns.set_style("whitegrid")

# 1. Class balance: number of reviews per sentiment label.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Reviews')
plt.show()

# 2. Length distribution: words per review, counted on the raw content by
#    splitting on whitespace.
df['content_length'] = df['content'].apply(lambda review: len(review.split()))

fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['content_length'], bins=50, color='blue', ax=ax)
ax.set_title('Distribution of Review Lengths')
ax.set_xlabel('Review Length (number of words)')
ax.set_ylabel('Number of Reviews')
# Cap the x-axis at 200 words so the visible range stays readable.
ax.set_xlim(0, 200)
plt.show()

# 3. Most frequent words in the cleaned titles, split by sentiment label.
# All cleaned titles of a class are concatenated into one string; the
# word-cloud section further down reuses these two strings.
positive_words = ' '.join(df.loc[df['sentiment'] == 1, 'title_cleaned'])
negative_words = ' '.join(df.loc[df['sentiment'] == 0, 'title_cleaned'])

# Token counts per class.
positive_word_freq = Counter(positive_words.split())
negative_word_freq = Counter(negative_words.split())

# The ten most common (word, count) pairs per class.
top10_positive_words = positive_word_freq.most_common(10)
top10_negative_words = negative_word_freq.most_common(10)

# One horizontal bar chart per class, side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
bar_panels = [
    (axes[0], top10_positive_words, 'Top 10 Words in Positive Titles'),
    (axes[1], top10_negative_words, 'Top 10 Words in Negative Titles'),
]
for ax, top_words, panel_title in bar_panels:
    counts = [count for word, count in top_words]
    words = [word for word, count in top_words]
    sns.barplot(x=counts, y=words, palette="viridis", ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Frequency')

fig.tight_layout()
plt.show()

# 4. word cloud
# WordCloud lays words out using a random generator. Passing a fixed
# random_state makes the figure reproducible across runs.
WORDCLOUD_SEED = 42

# One cloud per class, built from the concatenated cleaned titles
# (`positive_words` / `negative_words`) prepared in the common-words step.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
cloud_panels = [
    (axes[0], positive_words, 'Positive Reviews Word Cloud'),
    (axes[1], negative_words, 'Negative Reviews Word Cloud'),
]
for ax, class_text, panel_title in cloud_panels:
    wordcloud = WordCloud(background_color="white", max_words=100,
                          random_state=WORDCLOUD_SEED).generate(class_text)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')  # the image has no meaningful axes
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()
