In [11]:
"""
Step 0: Import the necessary dependencies and download NLTK resources
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from contractions import CONTRACTION_MAP
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm  # Use SVM for classification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Download necessary NLTK resources (tokenizer models and the stop-word corpus)
nltk.download('punkt')
nltk.download('stopwords')
# 'punkt_tab' is treated as optional. nltk.download() normally signals failure by
# returning False rather than raising, so check the return value as well as catching
# errors. Catch Exception (not a bare except) so that interrupting the kernel during
# the download is not silently swallowed.
try:
    punkt_tab_ready = nltk.download('punkt_tab')
except Exception:
    punkt_tab_ready = False
if not punkt_tab_ready:
    print("punkt_tab not available, continuing without it")

# Seed NumPy's global random number generator for reproducibility
np.random.seed(42)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yiningxiang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yiningxiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/yiningxiang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [12]:
"""
Step 1: Load the training data and explore its structure
"""
# Read the training and test review sets into DataFrames.
# Both paths are relative, so the CSV files are resolved against the current working directory.
train_data = pd.read_csv('ReviewsTraining.csv')
test_data = pd.read_csv('ReviewsTest.csv')


class DataLoader:
    """
    Helper for exploring the review data; also holds the stop-word configuration.
    """

    def __init__(self):
        # Create a set of English stopwords from the NLTK corpus
        self.stop_words = set(stopwords.words('english'))
        # Remove negation words from stopwords as they're important for sentiment
        # TODO: keeping only these words is probably not enough - review this list
        self.negation_words = {'no', 'not', 'nor', 'none', 'never', 'neither', 'hardly', 'barely'}
        self.stop_words = self.stop_words - self.negation_words

    def explore_data(self, data):
        """
        This function prints the shape, column names, first rows and per-column
        missing-value counts of the data.

        The wide pandas display options are applied only for the duration of this
        call, so the global display configuration of the notebook is left untouched.
        :param data: The input data (a pandas DataFrame)
        :return: None
        """
        # Show all columns/rows at full width while printing, then restore the
        # previous options automatically when the context exits.
        with pd.option_context('display.max_columns', None,
                               'display.max_rows', None,
                               'display.max_colwidth', None,
                               'display.width', 2000,
                               'display.expand_frame_repr', False):
            # Check the structure
            print("Data shape:", data.shape)
            print("\nColumns:", data.columns.tolist())
            print("\nSample data:")
            print(data.head())

            # Check for missing values
            print("\nMissing values:")
            print(data.isnull().sum())

    def plot_score_distribution(self, data):
        """
        This function plots the distribution of review scores
        :param data: The input data; must contain a 'Score' column
        :return: None
        """
        plt.figure(figsize=(10, 6))
        ax = sns.countplot(x='Score', data=data)
        plt.title('Distribution of Review Scores')
        plt.xlabel('Score')
        plt.ylabel('Count')

        # Add count labels on top of bars
        for p in ax.patches:
            height = p.get_height()
            # Bar heights are floats; format as whole numbers so a count reads
            # "123" rather than "123.0".
            ax.annotate(f'{height:.0f}',
                        (p.get_x() + p.get_width() / 2., height),
                        ha='center', va='baseline',
                        xytext=(0, 5), textcoords='offset points')

        plt.tight_layout()
        plt.show()

# # Uncomment the following lines to explore the training data
# # Create an instance of the DataLoader class
# data_loader = DataLoader()
# data_loader.explore_data(train_data) # Explore the training data
# data_loader.plot_score_distribution(train_data) # Plot the distribution of review scores

In [14]:
"""
Step 2: Text Preprocessing
    2.1 Expand contractions
    2.2 Text cleaning: convert to lowercase, remove special characters
    2.3 Advanced feature extraction
"""
def expand_contractions(text, contraction_mapping=None):
    """
    This function expands contractions in the text.

    Matching is case-insensitive and restricted to whole tokens, so a key such as
    "let's" is not expanded inside "wallet's". When the matched contraction starts
    with an uppercase letter, the expansion is capitalised; otherwise the expansion
    is inserted exactly as it appears in the mapping.
    :param text: The input text. Any non-string value yields "".
    :param contraction_mapping: dict mapping a contraction to its expansion.
        When None (the default), the module-level CONTRACTION_MAP is used.
    :return: The text with expanded contractions
    @example: 'don't' -> 'do not', 'can't' -> 'cannot'
    """
    if not isinstance(text, str):
        return ""
    if contraction_mapping is None:
        # Resolved at call time rather than captured when the function is defined
        contraction_mapping = CONTRACTION_MAP

    # Try longer keys first so "can't've" wins over its prefix "can't".
    # Empty keys are dropped: they would match at every position in the text.
    contraction_keys = sorted((key for key in contraction_mapping if key), key=len, reverse=True)
    if not contraction_keys:
        return text

    # Lower-cased view of the mapping for the case-insensitive fallback lookup.
    # A key that is already lowercase takes precedence over a mixed-case duplicate.
    lowercase_lookup = {}
    for key, expansion in contraction_mapping.items():
        folded_key = key.lower()
        if key == folded_key or folded_key not in lowercase_lookup:
            lowercase_lookup[folded_key] = expansion

    # Keys are escaped so they are matched literally; the look-around assertions
    # stop a key from matching in the middle of a longer word.
    contractions_pattern = re.compile(
        r'(?<!\w)(?:{})(?!\w)'.format('|'.join(re.escape(key) for key in contraction_keys)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        # Prefer an exact-case key, then fall back to the lower-cased lookup
        expanded_contraction = contraction_mapping.get(match) or lowercase_lookup.get(match.lower())
        if not expanded_contraction:
            return match
        # Carry over capitalisation only. Splicing the matched first character onto
        # the expansion would corrupt pairs whose first letters differ
        # (e.g. "ain't" -> "is not" would become "as not").
        if match[0].isupper():
            expanded_contraction = expanded_contraction[0].upper() + expanded_contraction[1:]
        return expanded_contraction

    return contractions_pattern.sub(expand_match, text)


# Sanity-check expand_contractions on a hand-written sentence and a few real reviews
example_text = "I can't believe it's not butter!"
print(f"Original: {example_text}")
print(f"Expanded: {expand_contractions(example_text)}")
# Preview only the first few reviews; printing every row of the test set floods
# the notebook output.
for line in test_data['Text'].head(5):
    print(f"Original: {line}")
    print(f"Expanded: {expand_contractions(line)}")
    print()

Original: I can't believe it's not butter!
Expanded: I cannot believe it is not butter!
Original: I really wish my toddler liked this. It seems to be very high quality and it would provide more variety in his diet. Every so often he'll eat it, but it usually needs to be mixed with something else.  I think it tastes fine, but it wasn't a hit. I bought a case so I keep trying.
Expanded: I really wish my toddler liked this. It seems to be very high quality and it would provide more variety in his diet. Every so often he will eat it, but it usually needs to be mixed with something else.  I think it tastes fine, but it was not a hit. I bought a case so I keep trying.

Original: I (and my little guy) have loved almost all other happy baby products, but this was revolting.  On the up side, my dog loves it . . .
Expanded: I (and my little guy) have loved almost all other happy baby products, but this was revolting.  On the up side, my dog loves it . . .

Original: I wanted this for the omega 3

In [None]:
# TODO: text preprocessing