# **Youtube Comment Sentiment Analysis**

### 1. Importing the required libraries

In [1]:
#Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
#Download NLTK Data (run once)
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 2. Importing the excel file

In [3]:
import os
import pandas as pd

# Load the uploaded CSV file
csv_path = os.environ.get("CSV_PATH", "test.csv")
df = pd.read_csv(csv_path)

# Normalize column names
df.columns = df.columns.str.strip().str.lower()

column_mapping = {
    'country': 'Country Code',
    'country code': 'Country Code',
    'views': 'Views',
    'video likes': 'Video Likes Added',
    'video likes added': 'Video Likes Added',
    'avg': 'Average Watch Time',
    'average watch time': 'Average Watch Time',
    'watch time': 'Average Watch Time',
    'comments': 'User Comments Added',
    'user comments': 'User Comments Added',
    'user comments added': 'User Comments Added',
    'is subscribed': 'Is Subscribed',
    'subscription status': 'Is Subscribed',
    'subscriptions added': 'User Subscriptions Added',
    'user subscriptions added': 'User Subscriptions Added',
    'text': 'Comment',
    'comment': 'Comment',
    'sentiments': 'Sentiment',
    'sentiment': 'Sentiment'
}

df.columns = [column_mapping.get(col.lower(), col) for col in df.columns]


In [4]:
df.columns

Index(['name', 'Country Code', 'Is Subscribed', 'video like', 'video dislike',
       'user subscription', 'Average Watch Time', 'Comment', 'Views'],
      dtype='object')

### 3. Data Cleaning 

In [5]:
df.head()
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54999 entries, 0 to 54998
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                54999 non-null  object 
 1   Country Code        54615 non-null  object 
 2   Is Subscribed       54999 non-null  bool   
 3   video like          54999 non-null  int64  
 4   video dislike       54999 non-null  int64  
 5   user subscription   54999 non-null  bool   
 6   Average Watch Time  53571 non-null  float64
 7   Comment             54999 non-null  object 
 8   Views               54999 non-null  int64  
dtypes: bool(2), float64(1), int64(3), object(3)
memory usage: 3.0+ MB


Unnamed: 0,video like,video dislike,Average Watch Time,Views
count,54999.0,54999.0,53571.0,54999.0
mean,0.498427,0.501573,206.205709,1.0
std,0.500002,0.500002,206.704726,0.0
min,0.0,0.0,0.0,1.0
25%,0.0,0.0,87.449071,1.0
50%,0.0,1.0,170.8552,1.0
75%,1.0,1.0,260.282291,1.0
max,1.0,1.0,5027.66,1.0


In [6]:
df.isnull().sum()

name                     0
Country Code           384
Is Subscribed            0
video like               0
video dislike            0
user subscription        0
Average Watch Time    1428
Comment                  0
Views                    0
dtype: int64

In [7]:
df.shape

(54999, 9)

In [8]:
df.columns

Index(['name', 'Country Code', 'Is Subscribed', 'video like', 'video dislike',
       'user subscription', 'Average Watch Time', 'Comment', 'Views'],
      dtype='object')

In [9]:
df.isnull().sum()
df = df.dropna()

In [10]:
df.isnull().sum()

name                  0
Country Code          0
Is Subscribed         0
video like            0
video dislike         0
user subscription     0
Average Watch Time    0
Comment               0
Views                 0
dtype: int64

In [11]:
for i in df :
    print(f"unique values in the columns:{i} \n\n {df[i].unique()}\n\n")

unique values in the columns:name 

 ['Ibrahim Choi' 'Yusuf Costa' 'Noah Costa' ... 'Darryl Christensen'
 'Allison Floyd' 'Michelle Patterson']


unique values in the columns:Country Code 

 ['HK' 'ME' 'RW' 'US' 'DE' 'ZZ' 'AR' 'BB' 'KZ' 'NP' 'AL' 'UA' 'HU' 'MM'
 'SE' 'EC' 'MA' 'GB' 'SD' 'MN' 'BR' 'KH' 'CH' 'RO' 'BH' 'ES' 'RU' 'CZ'
 'CL' 'CR' 'IL' 'BA' 'GE' 'SI' 'AT' 'PH' 'KE' 'UG' 'PT' 'KR' 'GH' 'SN'
 'HR' 'QA' 'JM' 'FI' 'LK' 'EG' 'PE' 'KY' 'AE' 'AU' 'BE' 'MY' 'KW' 'NO'
 'ZM' 'ZW' 'OM' 'TT' 'UY' 'LA' 'SA' 'CO' 'TR' 'PK' 'LU' 'ZA' 'NG' 'IT'
 'FR' 'NZ' 'CA' 'DZ' 'BW' 'BO' 'BD' 'ID' 'TN' 'GM' 'JP' 'TW' 'SV' 'TL'
 'MU' 'IR' 'VE' 'LT' 'AZ' 'CN' 'DJ' 'PY' 'SY' 'UZ' 'IN' 'BG' 'GR' 'MX'
 'CD' 'PR' 'SL' 'GT' 'IQ' 'EE' 'MK' 'LB' 'NI' 'PA' 'TH' 'AM' 'DK' 'SG'
 'BN' 'RS' 'BY' 'SK' 'CM' 'ET' 'VN' 'TZ' 'CY' 'PL' 'JO' 'IE' 'DO' 'MD'
 'SO' 'AF' 'MT' 'HN' 'NL' 'KG' 'PS' 'BI' 'IS' 'LY' 'BM' 'AW' 'AD' 'MV'
 'GD' 'SC' 'MC' 'LV' 'VG' 'MG' 'IM' 'CV' 'HT' 'MZ' 'ML' 'FJ' 'BT' 'SR'
 'TJ' 'YE' 'MO' 'LC' 'MQ' 'G

In [None]:
# Cleaning function for comments
def clean_comment(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE) 
    text = re.sub(r'\@w+|\#', '', text) 
    text = re.sub(r'[^\w\s]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
#Filter valid comments
df = df.dropna(subset=['Comment']) 
df['Comment'] = df['Comment'].apply(clean_comment)

### 4. Data Preprocessing

In [14]:
# Lemmatization
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    words = text.split()
    filtered = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(filtered)

df['Cleaned_Comment'] = df['Comment'].apply(preprocess_text)

In [15]:
# Sentiment labeling using TextBlob
from textblob import TextBlob

def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return 'Positive'
    elif polarity < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

df['Sentiment'] = df['Cleaned_Comment'].apply(get_sentiment)

In [16]:
# Feature and Target
X = df['Cleaned_Comment']
y = df['Sentiment']

In [17]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(X)

In [18]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_preds = log_model.predict(X_test)

In [19]:
log_model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [20]:
# Evaluation
print("\nLogistic Regression Performance")
print(classification_report(y_test, log_preds))
print("Accuracy:", accuracy_score(y_test, log_preds))


Logistic Regression Performance
              precision    recall  f1-score   support

    Negative       0.90      0.64      0.75       802
     Neutral       0.93      0.98      0.95      6607
    Positive       0.97      0.93      0.95      3266

    accuracy                           0.94     10675
   macro avg       0.93      0.85      0.88     10675
weighted avg       0.94      0.94      0.94     10675

Accuracy: 0.9397658079625293


### Top 3 Countries by View Count

In [21]:
import plotly.express as px

# Top 3 Countries by View Count 
top_countries = df.groupby('Country Code')['Views'].sum().sort_values(ascending=False).head(3).reset_index()

fig1 = px.bar(
    top_countries,
    x='Country Code',
    y='Views',
    color='Country Code',
    text='Views',
    title='Top 3 Countries by View Count',
    color_discrete_sequence=['#FF6E54', '#DD5182', '#955196']  
)

fig1.update_traces(
    hovertemplate='<b>%{x}</b><br>Views: %{y}',
    textposition='outside'
)

fig1.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    font=dict(color='white'),
    title_font=dict(color='white', size=20),
    height=400
)

fig1.show()


### Sentiment Distribution

In [22]:
import plotly.express as px

#  Sentiment Pie Chart 
fig2 = px.pie(
    df,
    names='Sentiment',
    title=' Sentiment Distribution',
    color='Sentiment',
    color_discrete_map={
        'Positive': '#FF6E54', 
        'Neutral': '#DD5182',
        'Negative': '#955196'
    },
    hole=0.3
)

fig2.update_traces(
    textinfo='percent+label',
    hoverinfo='label+percent+value',
    textfont_color='white',
    pull=[0.05, 0.05, 0.05]  
)

fig2.update_layout(
    showlegend=True,
    title_font=dict(size=20, family='Arial', color='white'),
    font=dict(size=14, color='white'),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    height=500
)

fig2.show()


### Top 10 most repeated comments

In [23]:
import plotly.express as px
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['Cleaned_Comment'])

# Nearest Neighbors
nn = NearestNeighbors(n_neighbors=6, metric='cosine')
nn.fit(tfidf_matrix)
distances, indices = nn.kneighbors(tfidf_matrix)

threshold = 0.2
comment_counter = Counter()

for i, (d_row, idx_row) in enumerate(zip(distances, indices)):
    for dist, j in zip(d_row[1:], idx_row[1:]):
        if dist < threshold:
            comment_counter[df['Comment'].iloc[i]] += 1
            comment_counter[df['Comment'].iloc[j]] += 1

comment_data = [(comment, freq) for comment, freq in comment_counter.most_common()]
comment_df = pd.DataFrame(comment_data, columns=['Comment', 'Rating']).head(10)

fig3 = px.bar(
    comment_df,
    x='Rating',
    y='Comment',
    orientation='h',
    text='Rating',
    color='Rating',
    color_continuous_scale='Agsunset',
    title=' Top 10 Most Repeated/Similar Comments (Interactive)'
)

fig3.update_traces(textposition='outside', hovertemplate='<b>%{y}</b><br>Rating: %{x}')
fig3.update_layout(
    yaxis=dict(title=''),
    xaxis=dict(title='Rating'),
    # plot_bgcolor='#1E1E1E',
    # paper_bgcolor='#1E1E1E',
    plot_bgcolor='rgba(0,0,0,0)',   
    paper_bgcolor='rgba(0,0,0,0)',  
    font=dict(color='white', size=14),
    title_font=dict(size=20),
    height=500
)

fig3.show()


### Subscribed or not subscribed to the channel

In [24]:
import plotly.express as px

# Prepare data
subscribed_data = df['Is Subscribed'].value_counts().reset_index()
subscribed_data.columns = ['Subscription Status', 'Count']
subscribed_data['Subscription Status'] = subscribed_data['Subscription Status'].map({True: 'Subscribed', False: 'Not Subscribed'})

subscribed_data['Code'] = subscribed_data['Subscription Status'].map({'Subscribed': 1, 'Not Subscribed': 0})

fig4 = px.bar(
    subscribed_data,
    x='Subscription Status',
    y='Count',
    text='Count',
    color='Code',
    color_continuous_scale='Agsunset',
    title=' Subscribed vs Not Subscribed (Styled)'
)

# Layout Styling
fig4.update_traces(
    textposition='outside',
    hovertemplate='<b>%{x}</b><br>Count: %{y}<extra></extra>'
)

fig4.update_layout(
    plot_bgcolor='rgba(0,0,0,0)',   
    paper_bgcolor='rgba(0,0,0,0)',  
    font=dict(color='white', size=14),
    title_font=dict(size=20),
    xaxis_title='',
    yaxis_title='Number of Users',
    height=450,
    coloraxis_showscale=False  
)

fig4.show()


In [25]:
import plotly.express as px
import pandas as pd
import pycountry
import random

def code_to_country_name(code):
    try:
        return pycountry.countries.get(alpha_2=code).name
    except:
        return None


df['Country Name'] = df['Country Code'].apply(code_to_country_name)

df = df.dropna(subset=['Country Name'])
df_subs = df[df['Is Subscribed'] == True]

sub_counts = df_subs['Country Name'].value_counts().to_dict()

subscriber_dots = []
for country, count in sub_counts.items():
    for _ in range(max(1, count // 5)):  
        subscriber_dots.append({'Country': country})

dots_df = pd.DataFrame(subscriber_dots)

if dots_df.empty:
    print("⚠️ No subscriber data found to plot.")
else:
    fig5 = px.scatter_geo(
        dots_df,
        locations='Country',
        locationmode='country names',
        projection='natural earth',
        title=' Subscribers by Country (1 Dot = ~5 Subscribers)',
        color='Country',  
    )

    fig5.update_traces(marker=dict(size=3, opacity=0.6))  

fig5.update_layout(
    title_x=0.01,
    geo=dict(
        showframe=False,
        showcoastlines=True,
        coastlinecolor="white",
        showland=True,
        landcolor="rgba(255, 255, 255, 0)",
        oceancolor="rgba(255, 255, 255, 0)",
        showocean=True,
        projection_type="natural earth",
        bgcolor='rgba(0,0,0,0)'
    ),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    legend=dict(
        bgcolor='rgba(0,0,0,0)',
        bordercolor='rgba(0,0,0,0)',
        font=dict(color='white')  
    )
)

top_3 = sorted(sub_counts.items(), key=lambda x: x[1], reverse=True)[:3]
top_text = "🏆 Top 3 Subscribers : <br>" + "<br>".join([f"{c}: {n}" for c, n in top_3])

fig5.add_annotation(
    text=top_text,
    x=0.01,           
    y=1,           
    xref="paper",
    yref="paper",
    showarrow=False,
    align="left",
    font=dict(size=14, color="white"),
    bgcolor="rgba(0,0,0,0)",
    borderpad=6
)


fig5.show()