##Mounting Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

###Loading Word2Vec API

In [None]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's
# downloader. `wv` is the lookup used by the similarity checks below and by
# preprocess_and_vectorize when it averages word vectors.
wv = api.load('word2vec-google-news-300')

###Testing Word2Vec

In [None]:
wv.similarity(w1="great", w2="good")

In [None]:
wv_great = wv["great"]
wv_good = wv["good"]


In [None]:
wv_great.shape, wv_good.shape

###Installing Spacy And Dependencies

In [None]:
import pandas as pd
import numpy as np

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Stand-in for ``locale.getpreferredencoding`` that always answers "UTF-8".

    ``do_setlocale`` is accepted only so the signature stays call-compatible
    with the function being replaced; its value is ignored.
    """
    return "UTF-8"


# Swap the stand-in into the locale module so later lookups of
# locale.getpreferredencoding get the fixed answer.
locale.getpreferredencoding = getpreferredencoding

In [None]:
!python -m spacy download en_core_web_lg

In [None]:
import spacy
import numpy as np

nlp = spacy.load("en_core_web_lg")

def preprocess_and_vectorize(text):
    """Turn one raw text cell into a single 300-dimensional document vector.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words and punctuation are discarded, and the lemma of every remaining
    token that spaCy reports a vector for is collected. The collected lemmas
    are then averaged with ``wv.get_mean_vector``.

    Parameters
    ----------
    text : object
        The cell value to vectorize. Anything that is not a non-blank string
        (a NaN float from a missing CSV cell, ``None``, a number, ``""``,
        whitespace only) is treated as "no text".

    Returns
    -------
    numpy.ndarray
        A vector of shape ``(300,)``. It is all zeros when there is no text
        or when no token survives the filtering.
    """
    # Dimensionality of the fallback vector; 300 is the size the notebook
    # hard-codes to line up with the vectors returned by `wv`.
    vector_size = 300

    # Guard every non-text value, not just floats: handing None or a number to
    # nlp() raises instead of yielding a usable vector. Blank strings produce
    # no usable tokens anyway, so short-circuit them without running spaCy.
    if not isinstance(text, str) or not text.strip():
        return np.zeros(vector_size)

    # Keep lemmas of content tokens only. has_vector is checked on the spaCy
    # token; a lemma may still be absent from `wv`, in which case
    # get_mean_vector's default handling of missing keys applies.
    lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct) and token.has_vector
    ]

    # get_mean_vector cannot average an empty list, so fall back to zeros.
    if not lemmas:
        return np.zeros(vector_size)

    return wv.get_mean_vector(lemmas)


###Testing Pre-Process Function

In [None]:
v = preprocess_and_vectorize("Don't worry if you don't understand")
v.shape

###Loading Dataset And Creating DataFrame

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/train_data.csv')
train_df

###Printing First 5 Rows

In [None]:
train_df.head()

###Printing Last 5 Rows

In [None]:
train_df.tail()

###Check Distribution Of Labels

In [None]:
#check the distribution of labels
train_df['Label'].value_counts()


###Creating Vectors For Text

In [None]:
# One document vector per training post; the function is passed directly
# instead of being wrapped in a pass-through lambda.
train_df['vector'] = train_df['Text data'].apply(preprocess_and_vectorize)

###Dropping Rows With Null Vectors

In [None]:
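# Disabled on purpose: preprocess_and_vectorize returns a zero vector rather
# than None, so there are no null vectors in 'vector' to drop.
#train_df.dropna(subset=['vector'], inplace=True)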


###Encoding The Labels

In [None]:
# Integer-encode the class names so the classifier receives numeric targets.
# Any label that is not a key of the mapping comes out of .map() as NaN.
label_to_num = {'moderate': 0, 'not depression': 1, 'severe': 2}
train_df['label_num'] = train_df['Label'].map(label_to_num)

# Preview the first five rows to confirm the new column.
train_df.head(5)

##Hold-Out Validation With Dev Dataset

###Loading Dev Dataset

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/Test_data_with_labels - test_data.csv')
test_df

In [None]:
dev_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/dev_data.csv')
dev_df

###Printing First 5 Rows

In [None]:
test_df.head()

In [None]:
dev_df.head()

###Printing Last 5 Rows

In [None]:
test_df.tail()

In [None]:
dev_df.tail()

###Check Distribution Of Labels

In [None]:
test_df['Label'].value_counts()

In [None]:
#check the distribution of labels
dev_df['Class labels'].value_counts()


###Encoding The Labels

In [None]:
# Integer-encode the dev labels with the same name -> number mapping used for
# the training labels. Labels missing from the mapping become NaN.
label_to_num = {'moderate': 0, 'not depression': 1, 'severe': 2}
dev_df['label_num'] = dev_df['Class labels'].map(label_to_num)

# Preview the first five rows to confirm the new column.
dev_df.head(5)

Unnamed: 0,Pid,text data,Class labels,label_num
0,dev_pid_1,Im scared : This is it. I lie to myself every ...,moderate,0
1,dev_pid_2,New to this but just wanted to vent : I just f...,moderate,0
2,dev_pid_3,I’m sad : It’s kinda always been an issue. I w...,moderate,0
3,dev_pid_4,Lonely but not alone. : All of my immediately ...,moderate,0
4,dev_pid_5,This year has been trash. : I dont know why I’...,moderate,0


In [None]:
# Integer-encode the labelled test file with the same name -> number mapping
# used for the training labels. Labels missing from the mapping become NaN.
label_to_num = {'moderate': 0, 'not depression': 1, 'severe': 2}
test_df['label_num'] = test_df['Label'].map(label_to_num)

# Preview the first five rows to confirm the new column.
test_df.head(5)

###Creating Vectors For Text

In [None]:
# One document vector per labelled test post.
test_df['vector'] = test_df['Text data'].apply(preprocess_and_vectorize)

In [None]:
# One document vector per dev post. The dev file names its text column in
# lower case ('text data'), unlike the train and test files.
dev_df['vector'] = dev_df['text data'].apply(preprocess_and_vectorize)

###Dropping Rows With Null Vectors

In [None]:
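# Disabled on purpose: preprocess_and_vectorize returns a zero vector rather
# than None, so there are no null vectors in 'vector' to drop.
#dev_df.dropna(subset=['vector'], inplace=True)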


###Creating Train And Test Values

In [None]:
# Features are the per-post vectors; targets are the integer-encoded labels.
X_train, Y_train = train_df['vector'], train_df['label_num']

# Evaluation split taken from the labelled test file.
# NOTE(review): the following cell rebinds X_test / Y_test to the dev split,
# so on a top-to-bottom run these two bindings are replaced before use.
X_test, Y_test = test_df['vector'], test_df['label_num']

In [None]:
# Features are the per-post vectors; targets are the integer-encoded labels.
X_train, Y_train = train_df['vector'], train_df['label_num']

# Evaluation split taken from the dev file.
X_test, Y_test = dev_df['vector'], dev_df['label_num']

###Modifying Values To Fit To Model

In [None]:
# Each Series stores one array per row, so its own shape is only (n_rows,).
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)

# Stack the per-row arrays into a single 2-D array for each split, which is
# the form passed to the classifier in the next section.
X_train_2d, X_test_2d = np.stack(X_train), np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

###Classifying The Results Using SVC

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Build an SVC with scikit-learn's default settings and train it on the
# stacked training vectors; fit() hands back the fitted estimator itself.
clf = SVC().fit(X_train_2d, Y_train)

# Predict the evaluation split and print the report comparing the
# predictions with the true labels.
y_pred = clf.predict(X_test_2d)
print(classification_report(Y_test, y_pred))


##Predicting Values Using Test Dataset

###Loading Test Dataset

In [None]:
test_df = pd.read_csv('/content/drive/MyDrive/NLP/test_data - test_data.csv')
test_df

###Printing First 5 Rows

In [None]:
test_df.head()

###Printing Last 5 Rows

In [None]:
test_df.tail()

In [None]:
# Replace missing cells with empty strings so that every 'Text data' value
# handed to the vectorizer is a str rather than a NaN float.
test_df = test_df.fillna(value='')

###Creating Vectors For Text

In [None]:
# One document vector per post of the freshly loaded test file.
test_df['vector'] = test_df['Text data'].apply(preprocess_and_vectorize)

###Dropping Rows With Null Vectors

In [None]:
import pandas as pd
import numpy as np

# Walk the frame row by row and print every row whose vector is None.
# NOTE(review): preprocess_and_vectorize falls back to a zero vector instead
# of None, so this scan is expected to print nothing.
for _row_label, record in test_df.iterrows():
    if record['vector'] is None:
        print(record)




In [None]:
# Discard rows whose 'vector' is missing, rebinding the name instead of
# mutating the frame in place.
# NOTE(review): any row removed here would leave that post without a
# prediction in the submission file; confirm that is intended.
test_df = test_df.dropna(subset=['vector'])


###Predicting Values

In [None]:
# Features are the per-post vectors; targets are the integer-encoded labels.
X_train, Y_train = train_df['vector'], train_df['label_num']

# Only features are taken from this test frame; no target column is read.
X_test = test_df['vector']

###Modifying To Fit To Model

In [None]:
# Each Series stores one array per row, so its own shape is only (n_rows,).
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)

# Stack the per-row arrays into a single 2-D array for each split, which is
# the form passed to the classifier in the next section.
X_train_2d, X_test_2d = np.stack(X_train), np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

###Classifying The Result

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Build an SVC with scikit-learn's default settings and train it on the
# stacked training vectors; fit() hands back the fitted estimator itself.
clf = SVC().fit(X_train_2d, Y_train)

# Predict a class number for every test vector; y_pred feeds the
# number -> name conversion in the next cell.
y_pred = clf.predict(X_test_2d)



In [None]:

# Translate the classifier's integer outputs back to their class names.
label_map = {0: 'moderate', 1: 'not depression', 2: 'severe'}
predicted_labels = [label_map[label] for label in y_pred]

# Assign the plain list so labels are matched to rows by position.
# Wrapping them in pd.Series(...) gives them a fresh 0..n-1 index that pandas
# aligns against test_df's own index on assignment; whenever that index is
# not exactly 0..n-1 (for example after rows were dropped), labels end up as
# NaN or attached to the wrong row.
test_df['predictions'] = predicted_labels

In [None]:
test_df

###Converting To TSV Format

In [None]:
# Shape the frame for submission in one chain: rename the id and prediction
# columns, then discard the raw text and the vectors.
test_df = (
    test_df
    .rename(columns={'Pid': 'pid', 'predictions': 'class_label'})
    .drop(columns=['Text data', 'vector'])
)


In [None]:
test_df

###Converting To TSV Formats

###Run1-Bert

In [None]:
run1 = pd.read_csv("/content/test_predictions.csv")
run1

In [None]:
# Shape run 1 for submission in one chain: rename the id and prediction
# columns, then discard the raw text.
run1 = (
    run1
    .rename(columns={'Pid': 'pid', 'predicted_label': 'class_label'})
    .drop(columns=['Text data'])
)


In [None]:
run1

In [None]:
run1.to_csv("TechSSN1_run1.tsv",index=False,sep="\t")

###Run2-Word2Vec

In [None]:
test_df.to_csv("TechSSN1_run2.tsv",index=False,sep="\t")

###Run3-Vectorize

In [None]:
run3 = pd.read_csv("/content/output_file_LSVC.csv")
run3

In [None]:
# Shape run 3 for submission in one chain: rename the id and prediction
# columns, then discard the 'Unnamed: 0' column.
run3 = (
    run3
    .rename(columns={'Pid': 'pid', 'Predicted_Label': 'class_label'})
    .drop(columns=['Unnamed: 0'])
)


In [None]:
run3

In [None]:
run3.to_csv("TechSSN1_run3.tsv",index=False,sep="\t")

##Comparing Values


In [None]:
predicted_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/output.csv')
predicted_df

In [None]:
# Normalise the id header: rename ' Pid' (leading space) to 'Pid', rebinding
# the name rather than mutating the frame in place.
predicted_df = predicted_df.rename(columns={' Pid': 'Pid'})

In [None]:
print(predicted_df.columns)

Index(['Pid', 'class_label'], dtype='object')


In [None]:
actual_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/Test_data_with_labels - test_data.csv')
actual_df

In [None]:
print(predicted_df.columns)
print(actual_df.columns)

Index([' Pid', 'class_label'], dtype='object')
Index(['Pid', 'Text data', 'Label'], dtype='object')


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Load the submitted predictions. Its id header is ' Pid' with a leading
# space, so rename it to match the labelled file's 'Pid' column.
predicted_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/output.csv')
predicted_df = predicted_df.rename(columns={' Pid': 'Pid'})

# Load the file that carries the true labels.
actual_df = pd.read_csv('/content/drive/MyDrive/NLP Research/Dataset/Test_data_with_labels - test_data.csv')

# Pair every prediction with its true label by post id ('Pid').
merged_df = pd.merge(predicted_df, actual_df, on='Pid', how='inner')

# An inner join keeps only ids found in both files (and repeats rows for
# duplicated ids), so the metrics below could describe a different set of
# posts than the labelled file. Say so instead of scoring it silently.
if len(merged_df) != len(actual_df):
    print('Warning: scoring', len(merged_df), 'merged rows, but the labelled file has', len(actual_df), 'rows')

# Predicted and true class names, row-aligned by the merge.
predicted_labels = merged_df['class_label']
actual_labels = merged_df['Label']

# Overall accuracy.
accuracy = accuracy_score(actual_labels, predicted_labels)
print('Accuracy:', accuracy)

# Classification report for the same label pairs.
report = classification_report(actual_labels, predicted_labels)
print('Classification Report:')
print(report)
