In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import sklearn
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")


#preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import MultiLabelBinarizer

#model building
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

#model evaluation
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score

In [2]:
df_data = pd.read_csv('blogtext.csv')
df_data

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...
...,...,...,...,...,...,...,...
681279,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan, I could write some really ..."
681280,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan, 'I have the second yeast i..."
681281,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan, Your 'boyfriend' is fuckin..."
681282,1713845,male,23,Student,Taurus,"01,July,2004","Dear Susan: Just to clarify, I am as..."


In [3]:
df = df_data.sample(n = 1000)
df.reset_index(drop='first',inplace= True)

In [4]:
df

Unnamed: 0,id,gender,age,topic,sign,date,text
0,1334509,female,23,Accounting,Taurus,"05,August,2004",::: CHEER UP BITCH ::: I needed a ...
1,870139,female,24,Non-Profit,Gemini,"27,April,2003",a what??? Yes...a door-handle-less ...
2,3369833,male,23,Tourism,Sagittarius,"05,July,2004",Monday : relaxed and fun start ...
3,1593902,male,15,Student,Taurus,"15,August,2003",If you said that you could kill the...
4,1859920,male,26,indUnk,Cancer,"18,September,2003","Oh my god where to start, well its ..."
...,...,...,...,...,...,...,...
995,3654939,male,27,indUnk,Cancer,"12,July,2004",Jeff and John's NFL weekly Picks Your ...
996,2664554,male,35,indUnk,Libra,"12,January,2004",Just saw a news flash on the internet t...
997,2534568,male,17,Education,Leo,"10,June,2004",Wow! I just spent about two ...
998,1414354,male,24,Non-Profit,Leo,"01,June,2004",I know that everyone has heard at...


In [5]:
df.text

0              ::: CHEER UP BITCH :::     I needed a ...
1               a what???   Yes...a door-handle-less ...
2                    Monday :  relaxed and fun start ...
3                 If you said that you could kill the...
4                 Oh my god where to start, well its ...
                             ...                        
995           Jeff and John's NFL weekly Picks  Your ...
996           Just saw a news flash on the internet t...
997                      Wow! I just spent about two ...
998                 I know that everyone has heard at...
999               Well, i now own my Harry potter and...
Name: text, Length: 1000, dtype: object

In [6]:
def preprocess_text(text):

    # Remove unwanted characters
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert text to lowercase
    text = text.lower()

    # Remove unwanted spaces
    text = " ".join(text.split())

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_text]

    # Join the processed words back into a single string
    processed_text = " ".join(lemmatized_text)

    return processed_text

In [7]:
df['text'] = df['text'].apply(preprocess_text)

In [8]:
df.text

0      cheer bitch needed good laugh damit got one bi...
1      yesa doorhandleless toyota camrythats said dri...
2      monday relaxed fun start race wed agreed pyro ...
3      said could kill thing inside youre liar everyt...
4      oh god start well since posted reason hey much...
                             ...                        
995    jeff john nfl weekly pick favorite college foo...
996    saw news flash internet kartik iyer heading in...
997    wow spent two hour putting together post james...
998    know everyone heard one point another fiasco u...
999    well harry potter order phenix book yipee hair...
Name: text, Length: 1000, dtype: object

In [9]:
# Merge the columns into a single column
df['labels'] = df.apply(lambda row: [row['gender'], str(row['age']), row['topic'], row['sign']], axis=1)
df['labels']

0      [female, 23, Accounting, Taurus]
1      [female, 24, Non-Profit, Gemini]
2      [male, 23, Tourism, Sagittarius]
3           [male, 15, Student, Taurus]
4            [male, 26, indUnk, Cancer]
                     ...               
995          [male, 27, indUnk, Cancer]
996           [male, 35, indUnk, Libra]
997          [male, 17, Education, Leo]
998         [male, 24, Non-Profit, Leo]
999       [male, 16, Technology, Virgo]
Name: labels, Length: 1000, dtype: object

In [10]:
df.drop(columns=['id','gender','age','topic','sign','date'],inplace= True)

In [11]:
df.columns

Index(['text', 'labels'], dtype='object')

In [12]:
df

Unnamed: 0,text,labels
0,cheer bitch needed good laugh damit got one bi...,"[female, 23, Accounting, Taurus]"
1,yesa doorhandleless toyota camrythats said dri...,"[female, 24, Non-Profit, Gemini]"
2,monday relaxed fun start race wed agreed pyro ...,"[male, 23, Tourism, Sagittarius]"
3,said could kill thing inside youre liar everyt...,"[male, 15, Student, Taurus]"
4,oh god start well since posted reason hey much...,"[male, 26, indUnk, Cancer]"
...,...,...
995,jeff john nfl weekly pick favorite college foo...,"[male, 27, indUnk, Cancer]"
996,saw news flash internet kartik iyer heading in...,"[male, 35, indUnk, Libra]"
997,wow spent two hour putting together post james...,"[male, 17, Education, Leo]"
998,know everyone heard one point another fiasco u...,"[male, 24, Non-Profit, Leo]"


In [13]:
# Separate features and labels
X = df['text']  # Features
y = df['labels']  # Labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=47)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Assuming you have the training and testing features as X_train and X_test

# Create the count vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Fit and transform the training features
X_train_cvectorized = vectorizer.fit_transform(X_train)

# Transform the testing features
X_test_cvectorized = vectorizer.transform(X_test)

# Print the term-document matrix
print("Term-Document Matrix:")
print(X_train_cvectorized.toarray())

Term-Document Matrix:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [15]:
label_counts = dict()

for Labels in df.labels.values:
    for label in Labels:
        if label in label_counts:
            label_counts[label] += 1
        else:
            label_counts[label] = 1
            
label_counts

{'female': 498,
 '23': 109,
 'Accounting': 10,
 'Taurus': 99,
 '24': 116,
 'Non-Profit': 21,
 'Gemini': 87,
 'male': 502,
 'Tourism': 7,
 'Sagittarius': 72,
 '15': 63,
 'Student': 223,
 '26': 90,
 'indUnk': 362,
 'Cancer': 91,
 '27': 66,
 'Technology': 67,
 '33': 26,
 'Scorpio': 81,
 '44': 3,
 'Aquarius': 76,
 '43': 11,
 '14': 45,
 'Virgo': 88,
 '16': 104,
 '25': 91,
 'Leo': 80,
 '17': 101,
 'Pisces': 72,
 '39': 10,
 '35': 34,
 'Aries': 112,
 'Capricorn': 65,
 'Libra': 77,
 '42': 4,
 'Engineering': 19,
 'Arts': 53,
 'Government': 10,
 'Internet': 26,
 '34': 29,
 'Publishing': 11,
 '37': 10,
 'Communications-Media': 33,
 'Science': 11,
 'Education': 54,
 'Sports-Recreation': 5,
 '41': 6,
 'Religion': 7,
 '36': 17,
 'HumanResources': 2,
 '38': 13,
 'Law': 12,
 'Telecommunications': 7,
 '13': 17,
 'Fashion': 4,
 '47': 8,
 'BusinessServices': 5,
 '40': 11,
 'Manufacturing': 4,
 'RealEstate': 6,
 'Advertising': 8,
 'Banking': 5,
 'Museums-Libraries': 2,
 'InvestmentBanking': 1,
 'Architectu

In [16]:
# Convert your train and test labels using MultiLabelBinarizer
mlb = MultiLabelBinarizer(classes=list(label_counts.keys()))
train_labels_binary = mlb.fit_transform(y_train)
test_labels_binary = mlb.transform(y_test)

In [17]:
mlb.classes_

array(['female', '23', 'Accounting', 'Taurus', '24', 'Non-Profit',
       'Gemini', 'male', 'Tourism', 'Sagittarius', '15', 'Student', '26',
       'indUnk', 'Cancer', '27', 'Technology', '33', 'Scorpio', '44',
       'Aquarius', '43', '14', 'Virgo', '16', '25', 'Leo', '17', 'Pisces',
       '39', '35', 'Aries', 'Capricorn', 'Libra', '42', 'Engineering',
       'Arts', 'Government', 'Internet', '34', 'Publishing', '37',
       'Communications-Media', 'Science', 'Education',
       'Sports-Recreation', '41', 'Religion', '36', 'HumanResources',
       '38', 'Law', 'Telecommunications', '13', 'Fashion', '47',
       'BusinessServices', '40', 'Manufacturing', 'RealEstate',
       'Advertising', 'Banking', 'Museums-Libraries', 'InvestmentBanking',
       'Architecture', 'Military', '46', 'Biotech', 'Automotive',
       'Marketing', '48', '45', 'Transportation', 'Chemicals', 'Maritime',
       'Consulting', 'Environment', 'LawEnforcement-Security',
       'Construction'], dtype=object)

In [18]:
# Create a LogisticRegression classifier
base_classifier = LogisticRegression(solver='lbfgs')

# Wrap the base classifier in OneVsRestClassifier
classifier = OneVsRestClassifier(base_classifier)

# Train the classifier on your training data
classifier.fit(X_train_cvectorized, train_labels_binary)

OneVsRestClassifier(estimator=LogisticRegression())

In [19]:
# Make predictions on the test data
predictions = classifier.predict(X_test_cvectorized)

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels_binary, predictions)
f1 = f1_score(test_labels_binary, predictions, average='micro')
avg_precision = average_precision_score(test_labels_binary, predictions, average='micro')
avg_recall = recall_score(test_labels_binary, predictions, average='micro')

# Print the evaluation metrics
print("Accuracy Score:", accuracy)
print("F1 Score:", f1)
print("Average Precision Score:", avg_precision)
print("Average Recall Score:", avg_recall)

Accuracy Score: 0.0
F1 Score: 0.2409867172675522
Average Precision Score: 0.12196993670886076
Average Recall Score: 0.15875


In [20]:
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score

# Fit the classifier on your training data
classifier.fit(X_train_cvectorized, train_labels_binary)

# Make predictions on the test data
predictions = classifier.predict(X_test_cvectorized)

# Calculate evaluation metrics
accuracy = accuracy_score(test_labels_binary, predictions)
f1_micro = f1_score(test_labels_binary, predictions, average='micro')
f1_macro = f1_score(test_labels_binary, predictions, average='macro')
f1_weighted = f1_score(test_labels_binary, predictions, average='weighted')
avg_precision_micro = average_precision_score(test_labels_binary, predictions, average='micro')
avg_precision_macro = average_precision_score(test_labels_binary, predictions, average='macro')
avg_precision_weighted = average_precision_score(test_labels_binary, predictions, average='weighted')
avg_recall_micro = recall_score(test_labels_binary, predictions, average='micro')
avg_recall_macro = recall_score(test_labels_binary, predictions, average='macro')
avg_recall_weighted = recall_score(test_labels_binary, predictions, average='weighted')

# Print the evaluation metrics
print("Accuracy Score:", accuracy)
print("F1 Score (micro):", f1_micro)
print("F1 Score (macro):", f1_macro)
print("F1 Score (weighted):", f1_weighted)
print("Average Precision Score (micro):", avg_precision_micro)
print("Average Precision Score (macro):", avg_precision_macro)
print("Average Precision Score (weighted):", avg_precision_weighted)
print("Average Recall Score (micro):", avg_recall_micro)
print("Average Recall Score (macro):", avg_recall_macro)
print("Average Recall Score (weighted):", avg_recall_weighted)

Accuracy Score: 0.0
F1 Score (micro): 0.2409867172675522
F1 Score (macro): 0.019047729248370622
F1 Score (weighted): 0.16728699099503988
Average Precision Score (micro): 0.12196993670886076
Average Precision Score (macro): nan
Average Precision Score (weighted): 0.22188965693947527
Average Recall Score (micro): 0.15875
Average Recall Score (macro): 0.017533681860948114
Average Recall Score (weighted): 0.15875


In [21]:
# Convert the predicted labels back to their original format
predicted_labels = mlb.inverse_transform(predictions)
true_labels = mlb.inverse_transform(test_labels_binary)

# Select five random indices
random_indices = np.random.choice(len(true_labels), size=5, replace=False)

# Print the true and predicted labels for the selected examples
for i in random_indices:
    print("Example:",i+1)
    print("Title:", X_test.iloc[i+1])
    print("True Label:", true_labels[i])
    print("Predicted Label:", predicted_labels[i])
    print()


Example: 156
Title: relfection 5 day practicum saturday morning feeling thursday afternoon become reality got cold flu week interesting really need think implication observation set plan following week observation week gone havent much except thought show mean better get good mileage thought time planned productive following week despite experience limited experience think supervising teacher teaching lot desired example poor classroom management abound two example cite order oneonone math reading testing repeatedly given class loose instruction play thing work within 5 minute naturally class thing shouldnt yell top voice dare heart sore lecture learn give poor instruction mistake kid get diverted think make sense useful activity available prepared would best way go planning day lesson obvious 1on1 work need something engage student getting play productive get kid excited another case oneonone testing asked child tested number come 7 quickly student playing something else come running 