**Psycological Analysis**
This model aims at predicting the psycological problem that a patient faces according to the information provided by the patient. In the models I used three input parameters on the basis of which the model is predicting the problem faced by the patient. These parameters are:-


1.   Age
2.   Gender
3.   Problem Description

I used method of NLP and vectorization of data an predict patterns based on data to predict the problem faced by the patient.





In [8]:
import spacy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV

In [9]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
np.set_printoptions(threshold=np.inf)

In [11]:
# Load the data
file = pd.read_csv("preprocessed_data.csv")
sw = stopwords.words('english') + ['.', '?', ',', "'", '-', " "]
port_stem = PorterStemmer()
nlp = spacy.load('en_core_web_sm')
cv = CountVectorizer()

data = list(file["Problem_description"])

In [12]:
# Text preprocessing
def extract_sents(data):
    extracted_sents = []
    for sent in data:
        doc = nlp(sent)
        lemmatized_words = [token.lemma_ for token in doc if token.text.lower() not in sw]
        extracted_sents.append(" ".join(lemmatized_words))
    return list(extracted_sents)

extract_sents = extract_sents(data)

In [13]:
# Vectorize the data
vc = cv.fit_transform(extract_sents)
vectorized_sens = list(vc.toarray())
np_vectorized_sens = np.array(vectorized_sens)

# ... (Code to save vocabulary omitted for brevity)

In [14]:
# Prepare features and labels
data_gender = list(file["Gender"])
cv_Gender = CountVectorizer()
vc_Gender = cv_Gender.fit_transform(data_gender)
np_Gender = np.array(vc_Gender.toarray()[:, 1])

le = LabelEncoder()
data_problem = list(file["psychological_catehory"])
fit = np.array(le.fit_transform(data_problem))
classes = le.classes_
np.save("classes.npy", classes)

data_age = np.array(file["Age"])
final_data = np.hstack((data_age.reshape(-1, 1), np_Gender.reshape(-1, 1), np_vectorized_sens, fit.reshape(-1, 1)))
np.save("final_data.npy", final_data)

In [15]:
# Oversample the minority class
ros = RandomOverSampler(random_state=42)
X, y = final_data[:, :-1], final_data[:, -1]
X_resampled, y_resampled = ros.fit_resample(X, y)

In [16]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [17]:
# Define the models
models = {
    'Random Forest': RandomForestClassifier(n_estimators=1000, random_state=42, max_depth=8),
    'Logistic Regression': LogisticRegression(random_state=42),
    'SVM': SVC(kernel='linear', random_state=42)
}

In [18]:
# Hyperparameter tuning using GridSearchCV
param_grids = {
    'Random Forest': {'n_estimators': [500, 1000, 1500], 'max_depth': [4, 8, 12]},
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'SVM': {'C': [0.1, 1, 10]}
}

In [19]:
best_models = {}
for name, model in models.items():
    print(f"Tuning hyperparameters for {name}...")
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best hyperparameters: {grid_search.best_params_}")

Tuning hyperparameters for Random Forest...
Best hyperparameters: {'max_depth': 12, 'n_estimators': 1000}
Tuning hyperparameters for Logistic Regression...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best hyperparameters: {'C': 0.1}
Tuning hyperparameters for SVM...
Best hyperparameters: {'C': 0.1}


In [20]:
# Evaluate the models
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")

Random Forest Accuracy: 0.9861495844875346
Logistic Regression Accuracy: 0.9889196675900277
SVM Accuracy: 0.9916897506925207


In [21]:
# Train the models
for name, model in models.items():
    model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
class Text_Processing:
    def __init__(self, age, gender, problem_description):
        self.age = age
        self.gender = gender
        self.problem_description = problem_description

    def extract_sents(self):
        extracted_sents = []
        for sent in self.problem_description:
            doc = nlp(sent)
            lemmatized_words = [token.lemma_ for token in doc if token.text.lower() not in sw]
            extracted_sents.append(" ".join(lemmatized_words))
        return list(extracted_sents)

    def vectorize_sents(self):
        extract_sents = self.extract_sents()
        vc = cv.transform(extract_sents)
        vectorized_sens = list(vc.toarray())
        np_vectorized_sens = np.array(vectorized_sens)
        return np_vectorized_sens

    def processing(self):
        # Processing Gender
        if self.gender.lower() == "male":
            gender = 0
        else:
            gender = 1
        gender = np.array([gender]).reshape(-1, 1)
        age = np.array([self.age]).reshape(-1, 1)
        vectorized_sens = self.vectorize_sents()
        final_stack = np.hstack((age, gender, vectorized_sens))
        return final_stack

In [23]:
# Get user input
age = int(input("Enter your age: "))
gender = input("Enter your gender: ")
problem = input("Enter your problem: ")

# Process user input
obj = Text_Processing(age, gender, [problem])
value = obj.processing()

# Predict the psychological problem
problem_value = models['SVM'].predict(value)

# Convert problem_value to original form
classes = np.load("classes.npy")
problem_value = classes[problem_value[0]]

print(f"Your problem is related to {problem_value}")

Enter your age: 27
Enter your gender: m
Enter your problem: I am having trouble focusing on the job
Your problem is related to Anxiety
