<a href="https://colab.research.google.com/github/Rohan-1103/Data-Science/blob/main/NLP/word2vec_custom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('IMDB Dataset.csv', engine='python')

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.drop_duplicates(inplace=True)

In [7]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [8]:
df['review'] = df['review'].apply(remove_tags)

In [9]:
df['review'] = df['review'].apply(lambda x:x.lower())

In [11]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')

df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [13]:
df['review']

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."
...,...
49995,thought movie right good job. creative origina...
49996,"bad plot, bad dialogue, bad acting, idiotic di..."
49997,catholic taught parochial elementary schools n...
49998,going disagree previous comment side maltin on...


In [15]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [16]:
import gensim

In [17]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [19]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:
story = []
for doc in df['review']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        story.append(simple_preprocess(sent))


In [21]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2
)

In [22]:
model.build_vocab(story)

In [23]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(29412029, 30767745)

In [24]:
len(model.wv.index_to_key)

61843

In [30]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    if len(doc) == 0:
        # Return a zero vector if no words are in the vocabulary
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

In [31]:
document_vector(df['review'].values[0])

array([ 7.79441744e-02, -3.58183831e-02, -6.63932860e-02, -3.10043156e-01,
        6.96203887e-01, -5.46606481e-01, -1.69470996e-01,  5.52173436e-01,
        3.68832409e-01, -3.44546884e-01, -3.22055131e-01, -4.76513237e-01,
       -2.91672587e-01,  1.05361201e-01,  2.24741727e-01,  7.26818889e-02,
       -2.41765469e-01, -3.15817475e-01, -1.48866564e-01, -1.69204757e-01,
        1.66974574e-01,  1.07697271e-01,  1.01987366e-02,  2.05293357e-01,
       -1.88812435e-01, -1.52904674e-01, -1.76732674e-01,  8.73027965e-02,
       -2.48854354e-01,  1.96705759e-02,  2.64609188e-01, -4.11420226e-01,
       -4.36449540e-04, -2.84448534e-01,  5.71990609e-02,  5.31864405e-01,
       -3.28645557e-01,  2.39176705e-01,  1.10737227e-01, -2.70607114e-01,
       -3.57454538e-01, -6.50834590e-02, -1.03958949e-01, -3.66203427e-01,
        1.55972362e-01,  1.44553602e-01,  2.20330626e-01, -4.68408614e-02,
       -2.56413847e-01,  2.02960312e-01,  1.67987168e-01,  7.33608287e-03,
       -1.71386510e-01, -

In [32]:
from tqdm import tqdm

## **Some important practical tips:**
1. **Use Ensemble techniques more:** Combining multiple models often leads to better predictive performance and robustness compared to using a single model. Ensemble methods like Random Forests, Gradient Boosting, or Stacking can capture diverse patterns in the data.
2. **Create domain specific Hueristic Features:** Leverage your understanding of the problem domain to engineer meaningful features. These features, derived from expert knowledge, can often provide significant predictive power that generic features might miss.
3. **Do not blindly use Deep Learning techniques/Directly start with deep learning:** While powerful, deep learning models require large datasets and significant computational resources. Start with simpler models and only move to deep learning if necessary, after exploring traditional machine learning approaches.
4. **Make sure dataset is not imbalanced:** An imbalanced dataset, where one class significantly outnumbers others, can lead to models that perform poorly on the minority class. Use techniques like oversampling, undersampling, or synthetic data generation (SMOTE) to address class imbalance.

In [33]:
X = []
for doc in tqdm(df['review'].values):
    X.append(document_vector(doc))

100%|██████████| 49582/49582 [48:22<00:00, 17.08it/s]


In [34]:
X = np.array(X)

In [35]:
X[0]

array([ 7.79441744e-02, -3.58183831e-02, -6.63932860e-02, -3.10043156e-01,
        6.96203887e-01, -5.46606481e-01, -1.69470996e-01,  5.52173436e-01,
        3.68832409e-01, -3.44546884e-01, -3.22055131e-01, -4.76513237e-01,
       -2.91672587e-01,  1.05361201e-01,  2.24741727e-01,  7.26818889e-02,
       -2.41765469e-01, -3.15817475e-01, -1.48866564e-01, -1.69204757e-01,
        1.66974574e-01,  1.07697271e-01,  1.01987366e-02,  2.05293357e-01,
       -1.88812435e-01, -1.52904674e-01, -1.76732674e-01,  8.73027965e-02,
       -2.48854354e-01,  1.96705759e-02,  2.64609188e-01, -4.11420226e-01,
       -4.36449540e-04, -2.84448534e-01,  5.71990609e-02,  5.31864405e-01,
       -3.28645557e-01,  2.39176705e-01,  1.10737227e-01, -2.70607114e-01,
       -3.57454538e-01, -6.50834590e-02, -1.03958949e-01, -3.66203427e-01,
        1.55972362e-01,  1.44553602e-01,  2.20330626e-01, -4.68408614e-02,
       -2.56413847e-01,  2.02960312e-01,  1.67987168e-01,  7.33608287e-03,
       -1.71386510e-01, -

In [36]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['sentiment'])

In [37]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [38]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [40]:
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.8152667137239085