In [None]:
# Install required libraries
!pip install pandas nltk scikit-learn transformers torch textblob -q

# Import libraries
import os
import pandas as pd
import re
import nltk
import glob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from textblob import TextBlob
from transformers import pipeline
from google.colab import files

# Fetch the NLTK resources the later cells rely on (tokenizer models,
# stop-word list, WordNet). 'punkt_tab' is requested in addition to 'punkt'
# to avoid the "Resource punkt_tab not found" LookupError.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

print("‚úÖ All libraries are installed and imported successfully.")

In [None]:
# --- Week 1: Data Collection & Input ---
import os
import glob

# 1. Define the path to the Kaggle input data.
# In Kaggle, attached datasets typically live under '../input/<dataset-name>/',
# so every subdirectory of '../input/' is searched for .txt files.
kaggle_input_dir = '../input/'
print(f"Searching for .txt files in {kaggle_input_dir}...")

# 2. Find all .txt files in the input directory and its subdirectories.
# recursive=True lets the '**' path component match nested subfolders.
# glob returns matches in an arbitrary, filesystem-dependent order, so the
# result is sorted to keep the processing order reproducible between runs.
file_paths = sorted(
    glob.glob(os.path.join(kaggle_input_dir, '**', '*.txt'), recursive=True)
)

if not file_paths:
    print("  - ‚ö†Ô∏è No .txt files found in the '../input/' directory.")
    print("  - Please use the '+ Add data' button in Kaggle to add your .txt file(s) as a dataset.")
else:
    for f_path in file_paths:
        print(f"  - Found file: '{f_path}'")

# 3. Define the function to read plain text files (this is unchanged)
def read_text_file(filepath):
    """Return the entire content of a plain .txt file decoded as UTF-8.

    Any exception raised while opening, reading, or decoding the file is
    reported on stdout and signalled to the caller by returning None
    instead of propagating.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file {filepath}: {err}")
    # Only reached when the read failed.
    return None

# Status line for this cell: reports how many .txt paths are held in file_paths.
print(f"\n‚úÖ Week 1 complete. {len(file_paths)} file(s) are ready to be processed.")

In [None]:
# --- Week 2: Data Preprocessing ---

def preprocess_text(text):
    """Clean, normalize, and lemmatize text for modeling.

    Processing steps, in order:
      1. Remove every character that is not an ASCII letter or whitespace.
      2. Lowercase the text.
      3. Tokenize with NLTK's word_tokenize.
      4. Drop English stop words.
      5. Lemmatize each remaining token with WordNetLemmatizer.

    Args:
        text: The raw document text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # '\s' (single backslash) keeps whitespace so word boundaries survive;
    # the previous pattern '\\s' excluded a literal backslash and the letter
    # 's' instead, which deleted all whitespace and fused the text into one
    # token. Flags are passed by keyword because the fourth positional
    # argument of re.sub is `count`, not `flags`.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    text = text.lower()
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return " ".join(lemmatized_tokens)

print("‚úÖ Week 2 functions defined (preprocess_text).")

In [None]:
# --- Week 3: Topic Modeling (LDA Only) ---

def get_topics_lda(text, n_topics=5, n_words=7):
    """Extract topics from a text with LDA over TF-IDF features.

    The whole text is treated as a single document: it is vectorized with
    TfidfVectorizer and a LatentDirichletAllocation model with a fixed
    random_state is fit on the result.

    Args:
        text: The (preprocessed) text to analyze.
        n_topics: Number of LDA components to fit.
        n_words: Maximum number of top-weighted words reported per topic.

    Returns:
        A tuple (report_string, topics_list). report_string is the
        printable report; topics_list holds one list of top words per
        topic. When the text has no usable vocabulary or fewer distinct
        terms than n_topics, the report carries a warning and topics_list
        is empty.
    """
    report_string = "--- üî¨ Key Themes (LDA) ---\n"
    too_short_warning = "Warning: Text is too short for meaningful topic modeling.\n"
    documents = [text]
    vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english')
    try:
        tfidf = vectorizer.fit_transform(documents)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty (empty
        # text, or text made up only of stop words). Report it like any other
        # too-short input instead of aborting the caller's processing loop.
        return report_string + too_short_warning, []

    if tfidf.shape[1] < n_topics:
        return report_string + too_short_warning, []

    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(tfidf)
    feature_names = vectorizer.get_feature_names_out()

    topics_list = []  # To store topics for the recommendation engine
    for topic_idx, topic_words in enumerate(lda.components_):
        # argsort is ascending; the reversed tail slice picks the n_words
        # highest-weighted terms, largest first.
        top_words = [feature_names[i] for i in topic_words.argsort()[:-n_words - 1:-1]]
        report_string += f"Topic {topic_idx + 1}: {', '.join(top_words)}\n"
        topics_list.append(top_words)

    return report_string, topics_list

print("‚úÖ Week 3 functions defined (LDA only).")



In [None]:
# --- Week 4: Sentiment Analysis ---

def get_sentiment(text):
    """Score the text's polarity with TextBlob and describe it in a report.

    Scores above 0.1 are labelled positive, scores below -0.1 negative,
    and everything in between (both bounds included) neutral.

    Returns:
        A tuple (polarity, report_string).
    """
    score = TextBlob(text).sentiment.polarity

    if score > 0.1:
        label = 'Positive üòä'
    elif score < -0.1:
        label = 'Negative üò†'
    else:
        label = 'Neutral üòê'

    report_lines = (
        "--- üìä Sentiment Analysis ---\n",
        f"Overall Sentiment: {label}\n",
        f"Polarity Score: {score:.2f} (Range: -1.0 to +1.0)\n",
    )
    return score, "".join(report_lines)

print("‚úÖ Week 4 functions defined (get_sentiment).")

In [None]:
# --- Week 5: Summarization & Insights Generation ---

# Summarization pipelines that have already been built, keyed by model name.
# Building the pipeline loads the model, so it is done once and reused for
# every later call instead of once per processed file.
_SUMMARIZER_CACHE = {}
_SUMMARY_MODEL = "sshleifer/distilbart-cnn-12-6"


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    if _SUMMARY_MODEL not in _SUMMARIZER_CACHE:
        # device=-1 forces CPU, which is more stable on some hosted instances.
        # Nothing is cached if construction raises, so a later call retries.
        _SUMMARIZER_CACHE[_SUMMARY_MODEL] = pipeline(
            "summarization", model=_SUMMARY_MODEL, device=-1
        )
    return _SUMMARIZER_CACHE[_SUMMARY_MODEL]


def generate_summary(text):
    """Generate an abstractive summary and return it as a report string.

    Args:
        text: The text to summarize. Only its first 1024 characters are
            passed to the model.

    Returns:
        A report string containing the summary, or a
        "Could not generate summary" line with the error message when
        loading the model or running it fails. This function does not raise.
    """
    report_string = "--- ‚ú® Abstractive Summary ---\n"
    try:
        summarizer = _get_summarizer()
        # Truncate text to avoid model length limits
        summary = summarizer(text[:1024], max_length=150, min_length=30, do_sample=False)
        report_string += summary[0]['summary_text'] + "\n"
    except Exception as e:
        # Best effort: a failed summary must not stop the rest of the report.
        report_string += f"Could not generate summary: {e}\n"
    return report_string

def generate_recommendations(polarity, topics):
    """Generate simple actionable insights based on analysis results.

    Args:
        polarity: Sentiment polarity score; values below -0.1 are treated
            as negative, above 0.1 as positive, otherwise neutral.
        topics: List of topics, each a list of words. Only the first three
            words of the first topic are quoted in the recommendation.

    Returns:
        A report string made of a header line and one recommendation line,
        each terminated by a newline.
    """
    report_string = "--- üí° Actionable Insights & Recommendations ---\n"

    if not topics:  # No topics available to base a recommendation on
        report_string += "Recommendation: No specific topics were identified for recommendations.\n"
        return report_string

    # First three words of the first topic in the list
    top_topic_words = topics[0][:3]

    # Each branch ends with a real newline ("\n"); the negative and positive
    # branches previously ended with "\\n", which printed a literal
    # backslash-n instead of breaking the line.
    if polarity < -0.1:
        report_string += f"Recommendation: The overall sentiment is negative. It is advised to investigate feedback related to the key topics identified, such as '{', '.join(top_topic_words)}'.\n"
    elif polarity > 0.1:
        report_string += f"Recommendation: The positive sentiment is strong. Consider leveraging this momentum by promoting aspects related to topics like '{', '.join(top_topic_words)}'.\n"
    else:
        report_string += "Recommendation: The sentiment is neutral. Further analysis may be needed to understand specific opinions.\n"
    return report_string

print("‚úÖ Week 5 functions defined (summarization and recommendations).")

In [None]:
# --- Weeks 6-7: Visualization & Reporting ---

# Import the libraries needed for plotting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from wordcloud import WordCloud

def generate_visualizations(text, polarity_score):
    """Render a word cloud of the text and a bar chart of the polarity score.

    Both figures are displayed directly; nothing is returned. A ValueError
    raised while building or drawing the word cloud is reported on stdout
    and does not prevent the sentiment chart from being shown.
    """
    print("\n--- üñºÔ∏è Visualizations ---")

    # 1. Word cloud
    cloud_options = dict(width=800, height=400, background_color='white', colormap='viridis')
    try:
        cloud_image = WordCloud(**cloud_options).generate(text)
        plt.figure(figsize=(10, 5))
        plt.imshow(cloud_image, interpolation='bilinear')
        plt.axis("off")
        plt.show()
    except ValueError:
        print("Could not generate word cloud (text might be too short).")

    # 2. Sentiment polarity chart: one horizontal bar, colored by the same
    # +/-0.1 thresholds used for the sentiment label.
    if polarity_score > 0.1:
        bar_color = 'green'
    elif polarity_score < -0.1:
        bar_color = 'red'
    else:
        bar_color = 'grey'

    polarity_bar = go.Bar(
        x=[polarity_score],
        y=['Sentiment'],
        orientation='h',
        marker=dict(color=bar_color),
    )
    fig = go.Figure(polarity_bar)
    fig.update_layout(
        title='Overall Sentiment Polarity Score',
        xaxis_title='Score (-1.0 to +1.0)',
        xaxis=dict(range=[-1, 1]),
    )
    fig.show()

print("‚úÖ Weeks 6-7 functions defined (generate_visualizations).")


In [None]:
# --- Pipeline Execution (Weeks 1-7) ---

# Run every analysis stage on each discovered file, one file at a time.
# All text reports for a file are printed together before its graphs are drawn.
for path in file_paths:
    print(f"\n\n{'='*40}\nüöÄ PROCESSING FILE: {os.path.basename(path)}\n{'='*40}")

    # Skip paths that no longer exist on disk (e.g. removed since file_paths was built).
    if not os.path.exists(path):
        print(f"‚ùå Error: File not found at {path}. Skipping.")
        continue

    # --- Week 1: Read Data ---
    # NOTE: despite its name, clean_text holds the file content exactly as
    # read; no cleaning has been applied to it at this point.
    clean_text = read_text_file(path)
    # read_text_file returns None on a read error and '' for an empty file;
    # both are falsy and cause the file to be skipped.
    if not clean_text:
        print(f"File {path} is empty or could not be read. Skipping.")
        continue

    print("--- Preview of Cleaned Text (first 300 chars) ---")
    print(clean_text[:300] + "...")

    # --- Week 2: Preprocess Data ---
    processed_text_for_model = preprocess_text(clean_text)

    # --- Week 3: Topic Modeling (LDA Only) ---
    # Topic modeling runs on the preprocessed text.
    lda_report, topics_list = get_topics_lda(processed_text_for_model)

    # --- Week 4: Sentiment Analysis ---
    # Sentiment is scored on the unprocessed text, not the preprocessed one.
    polarity, sentiment_report = get_sentiment(clean_text)

    # --- Week 5: Summarization & Insights ---
    # The summary is also generated from the unprocessed text; the
    # recommendations combine the polarity score with the LDA topics.
    summary_report = generate_summary(clean_text)
    recommendation_report = generate_recommendations(polarity, topics_list)

    # --- Print all text reports ---
    print("\n" + lda_report)
    print(sentiment_report)
    print(summary_report)
    print(recommendation_report)

    # --- Weeks 6-7: Generate Graphs ---
    # The word cloud is built from the preprocessed text.
    generate_visualizations(processed_text_for_model, polarity)

print("\n\n‚úÖ All files processed up to Week 7.")