In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [3]:
# Read the question/topic dataset from disk and preview its first rows.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index - consider index_col=0 if it is unneeded.
csv_path = 'filtered.csv'
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,questionText,topics
0,I had a head injury a few years ago and my min...,Anxiety
1,Me and my girlfriend just broke up. She said s...,Relationships
2,I have been with this guy on and off for 8 yea...,Relationships
3,"I have been with a guy for 4 years, he's a gr...",Relationships
4,What do I do if I have been feeling like I cou...,Relationships


In [4]:
# Keep only rows that have both a question and a topic label.
# Reassign instead of using inplace=True: the result is the same frame contents,
# but the cell no longer mutates an object that earlier cells created/displayed.
df = df.dropna(subset=['questionText', 'topics'])

In [5]:
# Normalize a question into lowercase, lemmatized, space-separated tokens
def clean_question_text(text):
    """Return a cleaned, lemmatized version of a question string.

    Steps, in order: strip URLs, strip punctuation, strip digits, lowercase,
    tokenize with ``word_tokenize``, lemmatize every token with
    ``WordNetLemmatizer`` and re-join the tokens with single spaces.

    Args:
        text: Raw question text (a ``str``).

    Returns:
        str: The cleaned tokens joined by single spaces ('' if no tokens remain).
    """
    # Remove URLs first, while their punctuation is still there to delimit them.
    # Case-insensitive so that 'HTTP://...' or 'Https://...' links are removed too
    # instead of being turned into junk tokens by the punctuation stripping below.
    text = re.sub(r'http\S+', '', text, flags=re.IGNORECASE)
    # Remove punctuation/symbols (note: \w matches '_', so underscores are kept)
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Lowercase, then split the question into word tokens
    tokens = word_tokenize(text.lower())
    # Reduce every token to its lemma
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Return the cleaned question as a single space-separated string
    return ' '.join(tokens)

In [6]:
# Add a column holding the normalized form of every question
df['clean_question_text'] = [clean_question_text(raw) for raw in df['questionText']]

In [7]:
# Seeded random split: 80% of the rows for training, the remaining rows for testing
train_fraction, split_seed = 0.8, 1
train = df.sample(frac=train_fraction, random_state=split_seed)
test = df.drop(index=train.index)

In [8]:
# Build dense TF-IDF feature matrices; the vectorizer is fitted on the training
# questions only and then applied unchanged to both splits
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
vectorizer.fit(train['clean_question_text'])
X_train, X_test = (
    vectorizer.transform(split['clean_question_text']).toarray()
    for split in (train, test)
)

In [9]:
# Topic labels for each split, taken from the same frames as the feature matrices
y_train, y_test = (split['topics'].values for split in (train, test))

In [18]:
# Fit a seeded 250-tree random forest on the training features and labels
forest_options = {'n_estimators': 250, 'random_state': 2}
rfc = RandomForestClassifier(**forest_options)
rfc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=250, random_state=2)

In [19]:
# Predict the topic of a single raw question with the fitted vectorizer and classifier
def predict_topic(question_text):
    """Return the topic predicted for one raw question string.

    The question is cleaned with ``clean_question_text``, turned into a
    one-row feature matrix by the module-level ``vectorizer`` and classified
    by the module-level ``rfc``.

    Args:
        question_text: Raw question text.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    # Wrap the cleaned question in a list: the vectorizer works on a batch of documents
    features = vectorizer.transform([clean_question_text(question_text)]).toarray()
    predictions = rfc.predict(features)
    return predictions[0]

In [24]:
# Example: classify one free-text question and print the predicted topic
question = "I don't know how to tell someone how I feel about them. How can I get better at expressing how I feel?"
predicted_topic = predict_topic(question)
print(predicted_topic)

Relationships
