In [None]:
'''
You may need to run this cell the first time that you work with the code provided here. 
Uncomment the following line and run it.
After the first time you run it, you should not have to run it again.
You may have to restart Jupyter after installing to get everything up and running
'''

#%pip install wordcloud

In [None]:
#Run this cellblock to import all the necessary libraries and packages for the code.
import re
import string
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
#from google.colab import files
import io

# Download necessary NLTK resources (data files that the NLTK helpers imported above rely on)
nltk.download('punkt')          # tokenizer models; presumably what sent_tokenize / word_tokenize load
nltk.download('punkt_tab')      # NOTE(review): looks like the variant of punkt needed by newer NLTK releases - confirm
nltk.download('stopwords')      # word lists read through stopwords.words('english')
nltk.download('wordnet')        # dictionary data used by WordNetLemmatizer
nltk.download('vader_lexicon')  # lexicon used by SentimentIntensityAnalyzer (VADER)

In [None]:
# ========================================================
# PART 0: CREATE FUNCTIONS TO UPLOAD AND LOAD TEXT DATA
# ========================================================

'''
No inputs necessary here, just run this code block.
'''

# Function to load a CSV file (works in any Python environment)
def load_csv(file_path=None):
    """
    Load a CSV file from a file path and return a pandas DataFrame

    Parameters:
    file_path (str, optional): Path to the CSV file. If None, the user is
        prompted to type (or paste) the path.

    Returns:
    DataFrame or None: The loaded pandas DataFrame, or None if the file could
        not be read (the error is printed instead of being raised).
    """
    try:
        if file_path is None:
            # No path supplied: ask for one, as the docstring promises.
            # Surrounding whitespace and quotes are stripped because pasted
            # paths often carry them along.
            file_path = input("Enter the path to your CSV file: ").strip().strip('"\'')

        df = pd.read_csv(file_path)
    except Exception as e:
        # Deliberately broad: this is a teaching helper, so any failure
        # (missing file, bad encoding, no interactive prompt available, ...)
        # is reported as a message and signalled with None rather than a traceback.
        print(f"Error reading CSV file: {e}")
        return None

    print(f"Successfully loaded '{file_path}' with {df.shape[0]} rows and {df.shape[1]} columns.")

    # Display the first few rows to check the data
    print("\nPreview of your data:")
    print(df.head())

    # Display column names
    print("\nColumn names (you'll need these to select text columns):")
    for i, col in enumerate(df.columns):
        print(f"{i}: {col}")

    return df

# Function to extract text from a dataframe
def extract_text_from_df(df, column_name, rows=None):
    """
    Extract text from a specific column in the dataframe

    Parameters:
    df (DataFrame): The pandas DataFrame
    column_name (str): The name of the column containing text
    rows (list or None): List of row index labels to include (looked up with
        df.loc), or None for all rows

    Returns:
    str: Text of the selected entries joined with newlines. Missing values
        (NaN/None) are skipped. An empty string is returned when the column
        does not exist.
    """
    if column_name not in df.columns:
        print(f"Error: Column '{column_name}' not found in the dataframe.")
        # Column labels are not always strings (e.g. a header-less CSV gives
        # integer labels), so convert them before joining.
        print(f"Available columns are: {', '.join(str(col) for col in df.columns)}")
        return ""

    if rows is None:
        # Use all rows if not specified
        selected_data = df[column_name]
    else:
        # Use only specified rows
        selected_data = df.loc[rows, column_name]

    # Filter out nan values and convert to string
    text_data = [str(text) for text in selected_data if not pd.isna(text)]

    # Combine all text with newlines in between
    combined_text = "\n".join(text_data)

    print(f"Extracted {len(text_data)} text entries from column '{column_name}'")
    print(f"Total length: {len(combined_text)} characters")

    return combined_text

In [None]:
# ========================================================
# PART 0: YOU UPLOAD AND LOAD TEXT DATA
# ========================================================

'''
This is where you will upload the csv file with the tweets. 
If you are successful in uploading the data, you will see a message
    Successfully loaded '___.csv' with 3467 rows and 2 columns.
'''

# Replace '???.csv' with the path to your CSV file
df = load_csv(file_path='???.csv')


# load_csv returns None when the file could not be read, so only continue on success
if df is not None:
    # Change '???' to the actual column name containing your text data
    text_column_name = '???'
    sample_text = extract_text_from_df(df=df, column_name=text_column_name)

In [None]:
# ========================================================
# PART 1: TEXT PREPROCESSING FUNCTIONS
# ========================================================

'''
Read through the code here to understand what the inputs to the function are
and what you as a user can change. Then run this code block. No inputs are required
from you here.
'''

# Function to clean and preprocess text
def preprocess_text(text, lowercase=True, remove_punctuation=True, remove_numbers=True):
    """
    Return a cleaned copy of `text`.

    Each cleaning step can be switched on or off independently:
    - lowercase: convert every character to lowercase
    - remove_punctuation: delete every character listed in string.punctuation
    - remove_numbers: delete every run of digits
    The steps are applied in that order.
    """
    cleaned = text.lower() if lowercase else text

    if remove_punctuation:
        # A translation table that maps each punctuation character to None deletes it
        drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
        cleaned = cleaned.translate(drop_punctuation)

    if remove_numbers:
        digit_runs = re.compile(r'\d+')
        cleaned = digit_runs.sub('', cleaned)

    return cleaned

In [None]:
# ========================================================
# PART 1: TEXT PREPROCESSING EXPLORATION
# ========================================================

'''
Work with the function below and explore the different parameters.
What does each of them appear to do to the output? How might this be helpful?
'''
preprocessed_text = preprocess_text(
    text=sample_text,
    lowercase=True,            # True: convert everything to lowercase
    remove_punctuation=True,   # True: strip punctuation characters
    remove_numbers=True,       # True: strip digits
)

print("--- Preprocessed Text ---")
# Only the first 500 characters are shown so the output stays readable
print(f"{preprocessed_text[:500]}...")

In [1]:
# ========================================================
# PART 2: TOKENIZATION FUNCTIONS
# ========================================================

# Function to tokenize text into sentences and words
def tokenize_text(text):
    """
    Tokenize text at two levels.

    Returns a (sentences, words) pair: the output of sent_tokenize(text)
    followed by the output of word_tokenize(text).
    """
    return sent_tokenize(text), word_tokenize(text)

In [None]:
# ========================================================
# PART 2: TOKENIZATION EXPLORATION
# ========================================================

'''
Here you will complete several different tasks to get an understanding of the tokenization function

    1. Return to Part 1 (text preprocessing), set all of the parameters to True, and then run this code block.
    2. Return to Part 1 (text preprocessing) and experiment with the parameters to determine how many total sentences
       there were in the tweets.
    
'''

sentences, words = tokenize_text(text=preprocessed_text)

print("\n--- Text Tokenization ---")
print(f"Number of sentences: {len(sentences)}")
print(f"Number of words: {len(words)}")

# Show a short sample of each so the output stays readable
print("\nFirst 3 sentences:")
for position, sentence in enumerate(sentences[:3], start=1):
    print(f"  {position}. {sentence}")

print("\nFirst 20 words:")
print(words[:20])

In [None]:
# ========================================================
# PART 3: STOPWORD REMOVAL FUNCTION
# ========================================================

'''
No inputs necessary here, just run this code block.
'''

def remove_stopwords(word_list, custom_stopwords=None):
    """
    Remove common stopwords from a list of words

    Parameters:
    word_list (list of str): The words to filter
    custom_stopwords (iterable of str or None): Extra words to remove in
        addition to the standard English stopwords

    Returns:
    list of str: The words that are not stopwords, in their original order
        and with their original capitalization
    """
    # Get standard English stopwords
    stop_words = set(stopwords.words('english'))

    # Add custom stopwords if provided (lowercased so they match like the built-in ones)
    if custom_stopwords:
        stop_words.update(word.lower() for word in custom_stopwords)

    # The stopword list is lowercase, so compare case-insensitively; otherwise
    # "The" or "And" would slip through whenever the text was not lowercased first.
    filtered_words = [word for word in word_list if word.lower() not in stop_words]

    return filtered_words

In [None]:
# ========================================================
# PART 3: STOPWORD REMOVAL EXPLORATION
# ========================================================

'''
Using your favorite search engine look up what a stopword is in NLP.

Once you understand what a stop word is run the code as it is written here.
How many words were removed?

Now add some of your own stop words. 
What words can you add to remove the largest number of words?
'''

custom_stopwords = ['', '', '' ]  #Add your stop words to this list here

filtered_words = remove_stopwords(word_list=words, custom_stopwords=custom_stopwords)

# Report how much the word list shrank
print(
    "\n--- After Stopword Removal ---",
    f"Original word count: {len(words)}",
    f"Filtered word count: {len(filtered_words)}",
    f"Removed {len(words) - len(filtered_words)} stopwords",
    "\nFirst 20 filtered words:",
    sep="\n",
)
print(filtered_words[:20])

In [None]:
# ========================================================
# PART 4: STEMMING AND LEMMATIZATION FUNCTION
# ========================================================

'''
No inputs necessary here, just run this code block.
'''

# Function to perform stemming and lemmatization
def stem_and_lemmatize(word_list):
    """
    Reduce each word in word_list in two different ways.

    Returns a (stemmed_words, lemmatized_words) pair of lists, produced with
    PorterStemmer and WordNetLemmatizer respectively. Both lists have one
    entry per input word, in the same order.
    """
    stem = PorterStemmer().stem
    lemmatize = WordNetLemmatizer().lemmatize

    stemmed_words = list(map(stem, word_list))
    lemmatized_words = [lemmatize(token) for token in word_list]

    return stemmed_words, lemmatized_words

In [None]:
# ========================================================
# PART 4: STEMMING AND LEMMATIZATION EXPLORATION
# ========================================================

'''
Using your favorite search engine look up what stemming and lemmatization are in NLP.
Why are these used? 

Once you understand what these methods are, run the following code and look at the output.
What do you think the output tells us about tweets with regards to language complexity?
'''

stemmed_words, lemmatized_words = stem_and_lemmatize(word_list=filtered_words)

print("\n--- Stemming vs Lemmatization ---")
print(f"Original word count: {len(filtered_words)}")

# Side-by-side table for (at most) the first 10 words
print("\nComparison of first 10 words:")
print(f"{'Original':<15} {'Stemmed':<15} {'Lemmatized':<15}")
print("-" * 45)
for original, stemmed, lemmatized in zip(filtered_words[:10], stemmed_words, lemmatized_words):
    print(f"{original:<15} {stemmed:<15} {lemmatized:<15}")

In [None]:
# ========================================================
# PART 5: FREQUENCY ANALYSIS FUNCTION
# ========================================================

'''
No inputs necessary here, just run this code block.
'''

# Function to analyze word frequency
def analyze_frequency(word_list, n=10):
    """
    Count how often each word occurs and pick out the n most common ones.

    Returns a (fdist, words_mc, counts_mc) triple: the FreqDist built from
    word_list, the n most common words, and their counts in matching order
    (two parallel lists that are ready to be plotted).
    """
    fdist = FreqDist(word_list)

    # Split the (word, count) pairs into two parallel lists
    words_mc, counts_mc = [], []
    for token, occurrences in fdist.most_common(n):
        words_mc.append(token)
        counts_mc.append(occurrences)

    return fdist, words_mc, counts_mc

In [None]:
# ========================================================
# PART 5: FREQUENCY ANALYSIS EXPLORATION
# ========================================================

'''
Run this function and look at the output.

Based on what you see, head back to previous parts of the code to 
create a frequency list, bar graph and word cloud that most accurately 
reflect the tweets. 
'''

# Change the number to see more or fewer top words
top_n = 15  # Number of top words to display

fdist, top_words, top_counts = analyze_frequency(filtered_words, top_n)

print(f"\n--- Word Frequency Analysis ---")
print(f"Top {top_n} most frequent words:")
for word, count in zip(top_words, top_counts):
    print(f"  {word}: {count}")

if not top_words:
    # WordCloud raises a ValueError when it is given no words, so stop here with a readable message
    print("\nNo words left to plot. Check your text and your stopword list.")
else:
    # Plot frequency distribution
    plt.figure(figsize=(12, 6))
    plt.bar(top_words, top_counts)
    plt.title(f'Top {top_n} Most Frequent Words')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Create word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                         max_words=100, contour_width=3, contour_color='steelblue')
    # Build the cloud from the same counts as the bar chart. Generating it from
    # re-joined text would let WordCloud re-tokenize the words and apply its own
    # stopword list and word-pair detection, so the two pictures could disagree.
    wordcloud.generate_from_frequencies(fdist)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud of Frequent Terms')
    plt.show()

In [None]:
# ========================================================
# PART 6: SENTIMENT ANALYSIS FUNCTION
# ========================================================

'''
No inputs necessary here, just run this code block.
'''

# Function to analyze sentiment
def analyze_sentiment(text, by_sentence=False):
    """
    Score the sentiment of text with VADER (SentimentIntensityAnalyzer).

    by_sentence=False: return the polarity scores of the text as a whole.
    by_sentence=True: split the text with sent_tokenize and return a list of
    (sentence, polarity scores) pairs, one per sentence.
    """
    sia = SentimentIntensityAnalyzer()

    # Whole-text mode: a single set of scores
    if not by_sentence:
        return sia.polarity_scores(text)

    # Sentence mode: one set of scores per sentence
    return [(sentence, sia.polarity_scores(sentence)) for sentence in sent_tokenize(text)]

In [None]:
# ========================================================
# PART 6: SENTIMENT ANALYSIS EXPLORATION
# ========================================================

'''
Run this block of code as it is to see the output.

Now change analyze_by_sentence to True. What happens?
'''

analyze_by_sentence = False  # False: score the whole text at once; True: score each sentence separately

sentiment_results = analyze_sentiment(sample_text, analyze_by_sentence)

print(f"\n--- Sentiment Analysis ---")
if analyze_by_sentence:
    # sentiment_results is a list of (sentence, scores) pairs in this mode
    print("Sentiment analysis by sentence:")
    for i, (sentence, sentiment) in enumerate(sentiment_results[:5]):  # Show first 5 for brevity
        print(f"\nSentence {i+1}: {sentence}")
        print(f"  Negative: {sentiment['neg']:.3f}")
        print(f"  Neutral: {sentiment['neu']:.3f}")
        print(f"  Positive: {sentiment['pos']:.3f}")
        print(f"  Compound: {sentiment['compound']:.3f}")

    # Also show average compound score (over all sentences, not just the 5 printed).
    # Empty text yields no sentences, so guard against dividing by zero.
    if sentiment_results:
        avg_compound = sum(s['compound'] for _, s in sentiment_results) / len(sentiment_results)
        print(f"\nAverage compound score: {avg_compound:.3f}")
    else:
        print("\nNo sentences were found, so there is no average compound score to report.")
else:
    # sentiment_results is a single dictionary of scores in this mode
    print("Overall sentiment analysis:")
    print(f"  Negative: {sentiment_results['neg']:.3f}")
    print(f"  Neutral: {sentiment_results['neu']:.3f}")
    print(f"  Positive: {sentiment_results['pos']:.3f}")
    print(f"  Compound: {sentiment_results['compound']:.3f}")