# In [None]:

# 1. PROJECT OVERVIEW
# -----------------------------------------------------------
# This notebook performs NLP analysis on student feedback data.
# Goals:
# - Extract top themes from Wins, Losses, and Blockers
# - Perform sentiment analysis
# - Visualize trends
# - Provide actionable recommendations
# -----------------------------------------------------------

# -----------------------------------------------------------
# 2. IMPORT LIBRARIES
# -----------------------------------------------------------
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import seaborn as sns

# Download the NLTK resources used below (NLTK caches them after the first run).
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is still requested for older installations.
# NOTE(review): on an NLTK version that does not know 'punkt_tab', download()
# is expected to just report the unknown id and carry on - confirm if you pin
# an old release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# -----------------------------------------------------------
# 3. LOAD AND INSPECT DATA
# -----------------------------------------------------------
# Read the exported form responses into a DataFrame.
df = pd.read_csv(
    "Copy of Umuzi XB1 Check in (Responses) - Form Responses 1.csv"
)

# Print the available headers so the column names below can be checked.
print(df.columns)

# Exact headers of the three free-text questions analysed in this notebook.
# Each string is split over two literals only to keep the lines short; the
# resulting values are unchanged.
win_col = (
    "Share a win from the last week "
    "(what went well, something you enjoyed)"
)
loss_col = (
    "Share a loss "
    "(something that was challenging or did not go well)"
)
blocker_col = (
    "Share a blocker, if any "
    "(anything that stopped you from doing what you needed to do)"
)

# -----------------------------------------------------------
# 4. DATA CLEANING & PREPROCESSING
# -----------------------------------------------------------
# Replace missing responses with empty strings so the text steps below never
# see NaN. The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# triggers a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.
df[win_col] = df[win_col].fillna("")
df[loss_col] = df[loss_col].fillna("")
df[blocker_col] = df[blocker_col].fillna("")

# -----------------------------------------------------------
# 5. TOKENIZATION, STOPWORD REMOVAL, LEMMATIZATION
# -----------------------------------------------------------
# English stopword set and lemmatizer shared by every preprocess() call.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def preprocess(text):
    """Turn one free-text response into a list of cleaned tokens.

    The text is lowercased, every character other than the letters a-z and
    whitespace is dropped, the remainder is split with ``word_tokenize``,
    English stopwords are removed and each remaining token is lemmatized.

    Args:
        text: The response to clean; must be a ``str``.

    Returns:
        A list of lemmatized tokens; empty when nothing survives cleaning.
    """
    # Lowercase first so the a-z character class below covers every letter.
    text = text.lower()
    # Keep letters and whitespace only. The class needs a single-backslash \s:
    # in a raw string a doubled backslash followed by s means "a literal
    # backslash or the letter s", which strips every space and fuses the whole
    # response into one token.
    text = re.sub(r'[^a-z\s]', '', text)
    # Split the cleaned text into word tokens.
    tokens = word_tokenize(text)
    # Drop stopwords, then reduce each remaining token to its lemma.
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

def _pooled_tokens(column):
    """Return the tokens of every response in ``df[column]`` as one flat list."""
    pooled = []
    for response in df[column]:
        pooled.extend(preprocess(response))
    return pooled


# One flat token list per question, in row order.
wins_tokens = _pooled_tokens(win_col)
losses_tokens = _pooled_tokens(loss_col)
blockers_tokens = _pooled_tokens(blocker_col)

# -----------------------------------------------------------
# 6. FREQUENCY ANALYSIS (TOP THEMES)
# -----------------------------------------------------------
# Count token frequencies per question and keep the five most common tokens
# as (token, count) pairs.
wins_top5, losses_top5, blockers_top5 = [
    Counter(bag).most_common(5)
    for bag in (wins_tokens, losses_tokens, blockers_tokens)
]

print("Top 5 Wins Themes:", wins_top5)
print("Top 5 Losses Themes:", losses_top5)
print("Top 5 Blockers Themes:", blockers_top5)

# -----------------------------------------------------------
# 7. SENTIMENT ANALYSIS
# -----------------------------------------------------------
def _polarity_scores(column):
    """Return the TextBlob polarity of every response in ``df[column]``."""
    return [TextBlob(str(response)).sentiment.polarity for response in df[column]]


def _polarity_summary(scores):
    """Count how many scores are above, equal to and below zero."""
    tally = {'positive': 0, 'neutral': 0, 'negative': 0}
    for score in scores:
        if score > 0:
            tally['positive'] += 1
        elif score == 0:
            tally['neutral'] += 1
        elif score < 0:
            tally['negative'] += 1
    return tally


# Per-response polarity for the wins and losses questions.
wins_sentiment = _polarity_scores(win_col)
losses_sentiment = _polarity_scores(loss_col)

# Positive / neutral / negative response counts per question.
wins_summary = _polarity_summary(wins_sentiment)
losses_summary = _polarity_summary(losses_sentiment)

print("Wins Sentiment Summary:", wins_summary)
print("Losses Sentiment Summary:", losses_summary)

# -----------------------------------------------------------
# 8. VISUALIZATIONS
# -----------------------------------------------------------
# One bar chart per question: tokens on the x-axis, their counts on the y-axis.
for chart_title, top_pairs in [
    ("Top 5 Wins Themes", wins_top5),
    ("Top 5 Losses Themes", losses_top5),
    ("Top 5 Blockers Themes", blockers_top5),
]:
    labels = [pair[0] for pair in top_pairs]
    counts = [pair[1] for pair in top_pairs]
    sns.barplot(x=labels, y=counts)
    plt.title(chart_title)
    plt.show()

# Word clouds: one figure per question, built from the pooled tokens.
for name, tokens in [('Wins', wins_tokens), ('Losses', losses_tokens), ('Blockers', blockers_tokens)]:
    try:
        wc = WordCloud(width=800, height=400, background_color='white').generate(' '.join(tokens))
    except ValueError:
        # generate() raises ValueError when the text yields no words to plot
        # (for example a question nobody answered). Skip that cloud instead of
        # aborting the remaining plots and the export step.
        print(f"Skipping {name} word cloud: no words to plot.")
        continue
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"{name} Word Cloud")
    plt.show()

# -----------------------------------------------------------
# 9. INSIGHTS & RECOMMENDATIONS
# -----------------------------------------------------------
# Print the recommendations. The backslash-escaped quotes and the doubled
# backslash before "n" were export residue that made these lines a
# SyntaxError; they are plain string literals again.
# NOTE(review): the text is hard-coded rather than derived from the computed
# themes - re-check it whenever the input data changes.
print("\nRecommendations:")
print("1. Improve internet/data support for students (frequent blocker).")
print("2. Offer time management workshops (common loss theme).")
print("3. Provide financial assistance or guidance (blocker and loss theme).")
print("4. Enhance clarity in instructions and resources (loss theme).")
print("5. Continue motivational and career planning activities (win theme).")

# -----------------------------------------------------------
# 10. EXPORT RESULTS
# -----------------------------------------------------------
# Write the top-5 themes and sentiment counts to analysis_summary.json.
import json

summary = dict(
    wins_top5=wins_top5,
    losses_top5=losses_top5,
    blockers_top5=blockers_top5,
    wins_sentiment=wins_summary,
    losses_sentiment=losses_summary,
)

# Serialize first, then write the text in one call; the (token, count) tuples
# come out as two-element JSON arrays.
serialized = json.dumps(summary, indent=4)
with open('analysis_summary.json', 'w') as out_file:
    out_file.write(serialized)
