In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("~/Downloads/train.csv")
test = pd.read_csv("~/Downloads/test.csv")

combined = pd.concat([train, test], ignore_index = True)

In [3]:
combined.shape, train.shape, test.shape

((84456, 2), (59119, 2), (25337, 2))

In [4]:
combined.head()

Unnamed: 0,Label,Content
0,2.0,second counting input 5 2 which receives inter...
1,4.0,extremely low temperature of the chips in cold...
2,3.0,of the basic ammonium salt of the carboxyl ate...
3,9.0,18 u2033 is provided which is axially supporte...
4,2.0,to an u201c inner surface u201d means the surf...


In [11]:
# Lets apply the function....
import re
def clean_text(text):
    text = re.sub("(www.+)|(\s+)|(@[a-zA-Z]+)|\W+", " ", text) # removes hyperlinks, special chars
    text = re.sub('(\w+:/\S+)', " ", text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub("http|https", " ", text)
    text = re.sub("[^a-zA-Z0-9]+", " ", text)
    return(text)

In [12]:
combined["Clean_Context"] = combined.Content.apply(clean_text)

In [13]:
combined.head()

Unnamed: 0,Label,Content,Clean_Context
0,2.0,second counting input 5 2 which receives inter...,second counting input 5 2 which receives inter...
1,4.0,extremely low temperature of the chips in cold...,extremely low temperature of the chips in cold...
2,3.0,of the basic ammonium salt of the carboxyl ate...,of the basic ammonium salt of the carboxyl ate...
3,9.0,18 u2033 is provided which is axially supporte...,18 u2033 is provided which is axially supporte...
4,2.0,to an u201c inner surface u201d means the surf...,to an u201c inner surface u201d means the surf...


In [15]:
combined["text_len"] = combined.Clean_Context.apply(len)

In [16]:
combined.head()

Unnamed: 0,Label,Content,Clean_Context,text_len
0,2.0,second counting input 5 2 which receives inter...,second counting input 5 2 which receives inter...,1504
1,4.0,extremely low temperature of the chips in cold...,extremely low temperature of the chips in cold...,1251
2,3.0,of the basic ammonium salt of the carboxyl ate...,of the basic ammonium salt of the carboxyl ate...,1345
3,9.0,18 u2033 is provided which is axially supporte...,18 u2033 is provided which is axially supporte...,1569
4,2.0,to an u201c inner surface u201d means the surf...,to an u201c inner surface u201d means the surf...,1733


In [30]:
newtrain = combined.loc[0:train.shape[0]-1, :]
newtest = combined.loc[train.shape[0]:, :]

In [32]:
newtest.drop("Label", axis  = 1, inplace = True)

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec

from gensim.utils import simple_preprocess

# Assume 'texts' is a list containing your documents


# Tokenization using Gensim
tokenized_train_docs = [simple_preprocess(text, deacc=True) for text in newtrain['Clean_Context']]  # `deacc=True` removes punctuations

# Tokenization using Gensim
tokenized_test_docs = [simple_preprocess(text, deacc=True) for text in newtest['Clean_Context']]  # `deacc=True` removes punctuations

In [57]:
from gensim.models import Word2Vec
# ... your code for training the Word2Vec model ...
word2vec_model = Word2Vec(sentences=tokenized_test_docs,
                          vector_size=100, window=5, min_count=1, workers=4)


def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    # Average the word vectors for a document
    return np.mean(word2vec_model.wv[doc], axis=0)

# Assuming `tokenized_train_docs` is a list of tokenized documents
vector_train = np.array([document_vector(word2vec_model, doc) for doc in tokenized_train_docs])


In [65]:
for i, vector in enumerate(vector_test):
    if isinstance(vector, np.ndarray):
        continue
    elif isinstance(vector, np.float64):
        print(f"Index {i} is a float, not an array.")
    else:
        print(f"Index {i} has type {type(vector)}.")

In [60]:
# Assuming you know the dimensionality of your word vectors, e.g., 100
vector_size = 100  # replace with the correct size of your vectors
vectors_train_corrected = np.zeros((len(vectors_train), vector_size))

for i, vector in enumerate(vectors_train):
    if isinstance(vector, np.float64):
        vectors_train_corrected[i] = np.zeros(vector_size)  # a zero vector
    else:
        vectors_train_corrected[i] = vector  # the original vector


In [62]:
# Train a Word2Vec model on Train Set

word2vec_model = Word2Vec(sentences=tokenized_test_docs,
                          vector_size=100, window=5, min_count=1, workers=4)

def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.key_to_index]
    if len(doc) == 0:
        return np.zeros(word2vec_model.vector_size)
    # Average the word vectors for a document
    return np.mean(word2vec_model.wv[doc], axis=0)
    
# Average the word vectors for a document
vector_test = np.array([document_vector(word2vec_model, doc) for doc in tokenized_test_docs])

In [50]:
newtrain['Label'] = newtrain['Label'].astype(int)

In [69]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(vectors_train_corrected, newtrain['Label'], 
                                                    test_size=0.2, random_state=42)

from sklearn.ensemble import GradientBoostingClassifier

# Train a classifier
classifier = GradientBoostingClassifier()
predictions = classifier.fit(X_train, y_train).predict(X_test)

# Evaluate the classifier
predictions = classifier.predict(X_test)

In [70]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           1       0.43      0.37      0.40      1356
           2       0.27      0.26      0.27      1322
           3       0.55      0.77      0.65      1221
           4       0.59      0.52      0.55      1271
           5       0.43      0.53      0.48      1276
           6       0.43      0.55      0.48      1331
           7       0.48      0.54      0.50      1398
           8       0.56      0.59      0.57      1449
           9       0.24      0.04      0.06      1200

    accuracy                           0.47     11824
   macro avg       0.44      0.46      0.44     11824
weighted avg       0.44      0.47      0.44     11824



In [84]:
# Encode the labels
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import to_categorical

from sklearn.feature_extraction.text import TfidfVectorizer

# Perform TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit the number of features to 1000 for simplicity
X_tfidf = tfidf_vectorizer.fit_transform(newtrain['Clean_Content']).toarray()


label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(newtrain['Label'])
y_categorical = to_categorical(y_encoded)

y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_categorical, test_size=0.2, random_state=42)

# Define the neural network structure
model = Sequential()
model.add(Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(y_train.shape[1], activation='softmax'))  # The output layer size should match the number of labels

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)

# Evaluate the model on test data
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')

KeyError: 'Clean_Content'

In [82]:
model = Sequential()
model.add(Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(256, activation='relu'))
# Use y_train_encoded.shape[1] to determine the output layer size
model.add(Dense(y_train_encoded.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Use y_train_encoded here
model.fit(X_train, y_train_encoded, epochs=10, batch_size=64, validation_split=0.1)

# Use y_test_encoded to evaluate the model
loss, accuracy = model.evaluate(X_test, y_test_encoded)
print(f'Test Accuracy: {accuracy*100:.2f}%')


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 50.43%


(47295,)