### 1. Import modules

In [10]:
import os
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')    # For tokenization
nltk.download('averaged_perceptron_tagger')  # For POS tagging

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vanng\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

### 2. Load data

In [6]:
train_path = "../datasets/train"
categories = ['neg', 'pos']
data = []

def is_git_lfs_file(file_path):
    """
    Check if a file is a Git LFS pointer
    """
    with open(file_path, "r", encoding="utf-8") as file:
        first_line = file.readline().strip()
        return first_line.startswith("version https://git-lfs.github.com")

for category in categories:
    category_path = os.path.join(train_path, category)
    category= 0 if category == "neg" else 1

    for filename in os.listdir(category_path):
        file_path = os.path.join(category_path, filename)

        if is_git_lfs_file(file_path): # skip Git LFS pointer files
            continue

        with open(file_path, "r", encoding='utf-8') as file:
            content = file.read().strip()
        
        data.append((content, category))

df = pd.DataFrame(data, columns=['content', 'category'])

df


Unnamed: 0,content,category
0,Airport '77 starts as a brand new luxury 747 p...,0
1,"I don't know who to blame, the timid writers o...",0
2,This film is one giant pant load. Paul Schrade...,0
3,"The plot for Descent, if it actually can be ca...",0
4,"""Ghost of Dragstrip Hollow"" appears to take pl...",0
...,...,...
1120,At first i didn't think that Ben Affleck could...,1
1121,What would you expect from a film titled 'Surv...,1
1122,This movie isn't as bad as I heard. It was enj...,1
1123,I laughed so hard during this movie my face hu...,1


### 3. Transform data


#### 3.1. Convert data from RAW to Tokens

In [7]:
def convert_tokens(rawtext, verbose):
    # 1. Tokenization
    pattern = r'\w+'
    tokenizer = RegexpTokenizer(pattern)
    token_words = tokenizer.tokenize(rawtext)
    if (verbose):
        print('Tokens:'+str(token_words[0:10]))
    
    # 2. Decapitalization
    decap_token_words = [word.lower() for word in token_words]
    if (verbose):
        print("Decapitalized tokens:" + str(decap_token_words[0:10]))
    
    # 3. Remove stop words
    stopwords_nltk_en = set(stopwords.words('english'))
    rmsw_token_words = ([word for word in token_words if word.lower() not in stopwords_nltk_en])
    if (verbose):
        print('Stopwords removed:' + str(rmsw_token_words[0:20]))
    
    # 4. Remove CAP words
    rmcap_token_words = []
    for word in rmsw_token_words:
        if word.isupper():
            rmcap_token_words.append(word.title())
        else:
            rmcap_token_words.append(word)
    if (verbose):
        print('CAPITALIZED removed:' + str(rmcap_token_words[0:20]))
    
    # 5. Remove salutation
    salutation = ['mr', 'mrs', 'ms', 'dr', 'phd', 'prof', 'rev']
    rmsalu_token_words = ([word for word in rmsw_token_words if word.lower() not in salutation])
    if (verbose):
        print('Salutation removed:' + str(rmsalu_token_words[0:20]))
    
    # 6. Define transfer tag function:
    def transfer_tag(treebank_tag):
        treebank_tag = treebank_tag.lower()
        if treebank_tag.startswith('j'):
            return "a"
        elif treebank_tag.startswith('v'):
            return "v"
        elif treebank_tag.startswith('n'):
            return 'n'
        elif treebank_tag.startswith('r'):
            return 'r'
        else:
            return 'n'
    
    # 7. Lemmatization
    wnl = WordNetLemmatizer()

    lemma_words = []
    for word, tag in nltk.pos_tag(rmsalu_token_words):
        firstletter = tag[0].lower()
        wtag = transfer_tag(firstletter)
        if not wtag:
            lemma_words.extend([word])
        else:
            lemma_words.extend([wnl.lemmatize(word, wtag)])
    if verbose:
        print('Lemma:' + str(lemma_words[0:10]))
    
    # 8. English words
    eng_words = [word for word in lemma_words if len(wn.synsets(word.lower())) > 1]

    # 9 Remove numbers
    rmnb_token_words = ([word for word in eng_words if not word.isdigit()])
    if (verbose):
        print('Number removed:' + str(rmnb_token_words[0:20]))
    
    return rmnb_token_words

In [12]:
df_tokenized = df.copy()
[n, d] = df_tokenized.shape
df_tokenized['tokens'] = ['']*n

for index, row in df_tokenized.iterrows():
    df_tokenized['tokens'].iloc[index] = convert_tokens(row['content'], verbose = False)

df_tokenized.head(10)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_tokenized['tokens'].iloc[index] = convert_tokens(row['content'], verbose = False)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

Unnamed: 0,content,category,tokens
0,Airport '77 starts as a brand new luxury 747 p...,0,"[start, brand, new, luxury, plane, load, valua..."
1,"I don't know who to blame, the timid writers o...",0,"[know, blame, timid, writer, director, seem, o..."
2,This film is one giant pant load. Paul Schrade...,0,"[film, one, giant, pant, load, Paul, lose, bad..."
3,"The plot for Descent, if it actually can be ca...",0,"[plot, Descent, actually, call, plot, two, not..."
4,"""Ghost of Dragstrip Hollow"" appears to take pl...",0,"[Ghost, Hollow, appear, take, place, era, long..."
5,Summer season is here when the choices in the ...,0,"[Summer, season, choice, cinemas, limited, hot..."
6,Shame on Yash Raj films and Aditya Chopra who ...,0,"[Shame, film, seem, lose, intelligence, year, ..."
7,If this is a 2008 product from one of the bigg...,0,"[product, one, big, production, house, Indian,..."
8,"I had some expectation for the movie, since it...",0,"[expectation, nice, star, cast, return, duo, W..."
9,I had a lot of expectations from this movie an...,0,"[lot, expectation, Film, br, br, Jimmy, operat..."


#### 3.2. TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(norm=None)
list_contents = []
for index, row in df_tokenized.iterrows():
    list_contents.append(" ".join(row.tokens))

tfidf_matrix = tfidf_vectorizer.fit_transform(list_contents)
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=[tfidf_vectorizer.get_feature_names_out()])

df_tfidf.head(10)

Unnamed: 0,3d,aback,abandon,abandoned,abandoning,abbey,abduct,abducts,abide,ability,...,zen,zero,zest,zillion,zip,zodiac,zombie,zombies,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.870769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
