In [5]:
%%time
# Basic libraries
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings('ignore')

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud

# NLP libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist

# Text processing
import emoji
from textblob import TextBlob
import contractions

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, SelectKBest

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

print("✓ All libraries imported successfully!")

✓ All libraries imported successfully!
CPU times: user 45.3 ms, sys: 6.62 ms, total: 52 ms
Wall time: 56.5 ms


In [6]:
from pathlib import Path
NOTEBOOK_DIR  = Path().resolve()
BASE_DIR = NOTEBOOK_DIR.parent
DATASET_DIR = BASE_DIR /  "data" / 'TwitterSentimentAnalysisDataset' 
DATASET_FILE_PATH = DATASET_DIR / "raw" / 'twitter_training.csv'
DATASET_FILE_PATH

PosixPath('/workspaces/CivicSense/data/TwitterSentimentAnalysisDataset/raw/twitter_training.csv')

In [11]:
df = pd.read_csv(str(DATASET_FILE_PATH))
df.columns = ['tweet_id', 'entity', 'sentiment', 'tweet_content']
print(f"Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print("\nFirst few rows:")
df.sample(5)

Dataset loaded successfully!
Shape: (74681, 4)

First few rows:


Unnamed: 0,tweet_id,entity,sentiment,tweet_content
72235,11176,TomClancysGhostRecon,Neutral,Good morning and good night from Auroa!... @ U...
62622,5135,GrandTheftAuto(GTA),Irrelevant,@ G2A _ com ur site is trash. Search for GTA V...
52394,10600,RedDeadRedemption(RDR),Positive,I won the Star Raider achievement in Red Pill ...
10558,13016,Xbox(Xseries),Negative,A revolutionary body form factor for new gamin...
72432,8809,Nvidia,Neutral,Latest The GST Daily! paper.li / GKConsultants...


In [12]:
print("BASIC DATA UNDERSTANDING")
print("="*80)
# Dataset info
print("\nDataset Info:")
print(df.info())
# Statistical summary
print("\nStatistical Summary:")
print(df.describe(include='all'))
# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())
# Check for duplicates
print(f"\nDuplicate rows: {df.duplicated().sum()}")

BASIC DATA UNDERSTANDING

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweet_id       74681 non-null  int64 
 1   entity         74681 non-null  object
 2   sentiment      74681 non-null  object
 3   tweet_content  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None

Statistical Summary:
            tweet_id     entity sentiment tweet_content
count   74681.000000      74681     74681         73995
unique           NaN         32         4         69490
top              NaN  Microsoft  Negative              
freq             NaN       2400     22542           172
mean     6432.640149        NaN       NaN           NaN
std      3740.423819        NaN       NaN           NaN
min         1.000000        NaN       NaN           NaN
25%      3195.000000        NaN       NaN           NaN
50%      6422