<a href="https://colab.research.google.com/github/TeneikaAskew/SocialPostClassifier/blob/main/Twitter_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## GitHub Setup

In [None]:
import os

def save_and_push_to_github(commit_message="Auto-save from Google Colab"):
    # Add all changes
    os.system("git add .")
    # Commit changes
    os.system(f'git commit -m "{commit_message}"')
    # Push changes to the GitHub repository
    os.system("git push origin main")  # Use 'master' if your default branch is named 'master'

In [None]:
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def autosave(line, cell):
    # Execute the cell
    exec(cell)
    # After execution, save and push to GitHub
    save_and_push_to_github("Autosave after cell execution")

# Twitter

## ETL

In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Define the file path (update this path to your file location in Google Drive)
js_file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/tweets.js'  # Update with the correct path
csv_file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv'

# Step 3: Load and parse the JSON directly from the file
import json
from datetime import datetime
import pytz
import pandas as pd

with open(js_file_path, 'r', encoding='utf-8') as file:
    content = file.read().split('=', 1)[-1].strip()  # Remove 'window.YTD.tweets.part0 ='
    data = json.loads(content.rstrip(';'))  # Remove trailing semicolon if present

# Step 4: Process the JSON data to extract relevant tweet information
tweet_data = []
for tweet in data:
    tweet_info = tweet.get('tweet', {})

    # Extract specific fields from each tweet
    created_at_utc = tweet_info.get("created_at")
    if created_at_utc:
        # Convert created_at to EST
        utc_time = datetime.strptime(created_at_utc, '%a %b %d %H:%M:%S +0000 %Y')
        est_time = utc_time.replace(tzinfo=pytz.utc).astimezone(pytz.timezone('US/Eastern'))
        est_time_str = est_time.strftime('%Y-%m-%d %H:%M:%S')
    else:
        est_time_str = None

    flattened_tweet = {
        "tweet_id": tweet_info.get("id_str"),
        "created_at": created_at_utc,
        "created_at_est": est_time_str,  # New column with EST time
        "full_text": tweet_info.get("full_text"),
        "favorite_count": tweet_info.get("favorite_count"),
        "retweet_count": tweet_info.get("retweet_count"),
        "in_reply_to_screen_name": tweet_info.get("in_reply_to_screen_name"),
        "lang": tweet_info.get("lang"),
        "source": tweet_info.get("source"),
        "user_mentions": [
            mention.get("screen_name") for mention in tweet_info.get("entities", {}).get("user_mentions", [])
        ] + [tweet_info.get("user", {}).get("screen_name")],
        "hashtags": [
            hashtag.get("text") for hashtag in tweet_info.get("entities", {}).get("hashtags", [])
        ],
        "tweet_url": f"https://twitter.com/teneikaask_you/status/{tweet_info.get('id_str')}",
        "tweet_replied_to": tweet_info.get("in_reply_to_status_id_str")
    }
    tweet_data.append(flattened_tweet)

# Convert to DataFrame
df = pd.DataFrame(tweet_data)

# Step 5: Save the DataFrame as CSV
df.to_csv(csv_file_path, index=False)

print(f'File successfully converted and saved at: {csv_file_path}')

# Load and preprocess the data
file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv'
df = pd.read_csv(file_path)

Mounted at /content/drive
File successfully converted and saved at: /content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv


## Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re


# Step 0: Filter to items with a created date of October 1, 2019 or later
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df = df[df['created_at'] >= '2019-10-01']

# Assigning labels for `resource_type` and `career_area` based on keywords
resource_labels = {
    "entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "scholarship": ["scholarship"],
    "bootcamp": ["bootcamp"],
    "resume": ["resume"],
    "apprenticeship": ["apprenticeship"],
    "job": ["job opening", "hiring", "position"],
    "upskilling": ["upskill", "learning", "training"],
    "conferences": ["conference", "event", "seminar"],
    "general_discussion": ["discussion", "opinion", "thoughts"]
}

career_labels = {
    "Entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "Data Analytics": ["data analytics", "data analysis"],
    "AI": ["artificial intelligence", "ai"],
    "Data Engineering": ["data engineering", "data pipeline"],
    "ServiceNow": ["servicenow"],
    "Salesforce": ["salesforce"],
    "Cloud": ["cloud", "aws", "azure", "gcp"],
    "UX": ["ux", "user experience"],
    "Product Management": ["product management"],
    "Product Design": ["product design"],
    "Project Management": ["project management", "pmp"],
    "Digital Marketing": ["digital marketing"],
    "Software & Systems Engineering": ["software engineering", "systems engineering"],
    "Data": ["data"],
    "Software or Web Development": ["software development", "web development"],
    "Tech Sales": ["tech sales"]
}

# Assign labels based on keywords
def assign_labels(text, label_dict):
    for label, keywords in label_dict.items():
        if any(keyword.lower() in str(text).lower() for keyword in keywords):
            return label
    return None

df['resource_type'] = df['full_text'].apply(lambda x: assign_labels(x, resource_labels))
df['career_area'] = df['full_text'].apply(lambda x: assign_labels(x, career_labels))

# Extract links to a new field
df['resource_link'] = df['full_text'].str.extract(r'(https?://\S+)')

# Prepare data for model training - drop rows without labels
labeled_df = df.dropna(subset=['resource_type', 'career_area'])


  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')


In [None]:
# Step 2: Text vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(labeled_df['full_text'])

# Encode the labels
y_resource = labeled_df['resource_type']
y_career = labeled_df['career_area']

# Split data for resource type prediction
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, test_size=0.2, random_state=42)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, test_size=0.2, random_state=42)

# Step 3: Train the classifier for resource type
resource_model = LogisticRegression(max_iter=200)
resource_model.fit(X_train_res, y_train_res)

# Train the classifier for career area
career_model = LogisticRegression(max_iter=200)
career_model.fit(X_train_career, y_train_career)

# Step 4: Evaluate both models
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Classification reports
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res))
print(f"Resource Type Accuracy: {accuracy_score(y_test_res, y_pred_res):.2f}")

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career))
print(f"Career Area Accuracy: {accuracy_score(y_test_career, y_pred_career):.2f}")


Resource Type Classification Report
                    precision    recall  f1-score   support

    apprenticeship       1.00      0.62      0.77        24
          bootcamp       0.95      0.83      0.88        46
       conferences       0.94      0.48      0.64        33
      entrepreneur       0.79      0.94      0.86       147
general_discussion       0.00      0.00      0.00         6
               job       0.90      0.82      0.86        68
            resume       0.94      0.60      0.73        25
       scholarship       1.00      0.84      0.92        45
        upskilling       0.85      0.99      0.92       202

          accuracy                           0.87       596
         macro avg       0.82      0.68      0.73       596
      weighted avg       0.87      0.87      0.86       596

Resource Type Accuracy: 0.87

Career Area Classification Report
                                precision    recall  f1-score   support

                            AI       0.72   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Accuracy:** 87% for Resource Type, 79% for Career Area.

**Strengths:** High precision for specific categories like "entrepreneur" and "upskilling" in Resource Type.

**Weaknesses:** Low recall for less common categories, with some classes (like "general_discussion" in Career Area) showing zero recall.

**Interpretation:** This model performs well for high-frequency labels but struggles with underrepresented categories, especially in Career Area.

In [None]:
# prompt: show this in a df y_test_career, y_pred_career
pd.DataFrame({'y_test_career': y_test_career, 'y_pred_career': y_pred_career})

Unnamed: 0,y_test_career,y_pred_career
31272,AI,AI
41028,Entrepreneur,Entrepreneur
13048,Entrepreneur,Entrepreneur
4640,Data,Data
28434,Data,AI
...,...,...
115,Entrepreneur,Entrepreneur
14434,Entrepreneur,Entrepreneur
6306,Software & Systems Engineering,AI
20566,AI,AI


## Expansion of Labels

Revision of labels to consistently refine after each run

In [None]:
# Assigning labels for `resource_type` and `career_area` based on keywords
resource_labels = {
    "entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "scholarship": ["scholarship"],
    "bootcamp": ["bootcamp", "program", "academy", "certificate"],
    "resume": ["resume", "cv", "curriculum vitae"],
    "job": ["job opening", "hiring", "position", "apprenticeship", "internship", "apprentice", "career opportunity", "job posting"],
    "upskilling": ["upskill", "learning", "training", "skill development", "course", "certification"],
    "conferences": ["conference", "event", "seminar", "webinar", "meetup", "workshop", "fireside chat", "fireside", "panel", "summit"],
    "general_discussion": ["discussion", "opinion", "thoughts", "general", "comment", "feedback", "insight"]
}

career_labels = {
    "Entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "Data Analytics": ["data analytics", "data analysis", "business intelligence", "BI", "data analyst"],
    "AI": ["artificial intelligence", "ai", "machine learning", "ML", "deep learning"],
    "Data Engineering": ["data engineering", "data pipeline", "data engineer", "data infrastructure", "ETL", "big data"],
    "ServiceNow": ["servicenow"],
    "Students": ["student", "students", "early career", "student"],
    "Salesforce": ["salesforce", "crm"],
    "Cloud": ["cloud", "aws", "azure", "gcp", "oracle", "cloud computing"],
    "Cybersecurity": ["cyber", "cybersecurity", "networking", "linux", "soc", "cyber security", "security", "penetration testing"],
    "UX": ["ux", "user experience", "ui", "design thinking", "interface", "human centered", "design", "ui"],
    "Product Management": ["product management", "product manager", "product", "agile"],
    "Product Design": ["product design", "product development"],
    "Project Management": ["project management", "pmp", "project planning"],
    "Digital Marketing": ["digital marketing", "social media", "seo", "content marketing"],
    "Software & Systems Engineering": ["software engineering", "systems engineering", "embedded systems", "systems architect"],
    "Data": ["data", "data science", "statistics"],
    "Software or Web Development": ["software development", "web development", "devops", "frontend", "backend", "full stack", "javascript", "react"],
    "Tech Sales": ["tech sales", "technical sales", "business development"],
    "Finance": ["finance", "financial", "accounting", "investment", "capital"],
    "Tech": ["tech", "technical", "technology", "FAANG", "Silicon Valley"],
    "Non-Tech": ["nontech", "non-tech", "non technical"],
    "GovTech": ["govtech", "gov-tech", "irs", "fema", "dod", "digital corps", "coding it forward", "digital service", "gsa", "tts", "public sector"]
}

# Assign labels based on keywords
def assign_labels(text, label_dict):
    for label, keywords in label_dict.items():
        if any(keyword.lower() in str(text).lower() for keyword in keywords):
            return label
    return None

df['resource_type'] = df['full_text'].apply(lambda x: assign_labels(x, resource_labels))
df['career_area'] = df['full_text'].apply(lambda x: assign_labels(x, career_labels))

# Extract links to a new field
df['resource_link'] = df['full_text'].str.extract(r'(https?://\S+)')

# Prepare data for model training - drop rows without labels
labeled_df = df.dropna(subset=['resource_type', 'career_area'])


In [None]:

# Step 2: Text vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(labeled_df['full_text'])

# Encode the labels
y_resource = labeled_df['resource_type']
y_career = labeled_df['career_area']

# Split data for resource type prediction
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, test_size=0.2, random_state=42)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, test_size=0.2, random_state=42)

# Step 3: Train the classifier for resource type
resource_model = LogisticRegression(max_iter=200)
resource_model.fit(X_train_res, y_train_res)

# Train the classifier for career area
career_model = LogisticRegression(max_iter=200)
career_model.fit(X_train_career, y_train_career)

# Step 4: Evaluate both models
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Classification reports
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res))
print(f"Resource Type Accuracy: {accuracy_score(y_test_res, y_pred_res):.2f}")

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career))
print(f"Career Area Accuracy: {accuracy_score(y_test_career, y_pred_career):.2f}")


Resource Type Classification Report
                    precision    recall  f1-score   support

          bootcamp       0.92      0.94      0.93       291
       conferences       0.89      0.69      0.78        84
      entrepreneur       0.98      0.86      0.92       149
general_discussion       1.00      0.38      0.55        40
               job       0.92      0.92      0.92       159
            resume       1.00      0.71      0.83        49
       scholarship       0.98      0.84      0.90        68
        upskilling       0.78      0.99      0.87       301

          accuracy                           0.89      1141
         macro avg       0.94      0.79      0.84      1141
      weighted avg       0.90      0.89      0.88      1141

Resource Type Accuracy: 0.89

Career Area Classification Report
                                precision    recall  f1-score   support

                            AI       0.53      0.93      0.67       366
                         Cloud  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Accuracy:** 89% for Resource Type, 64% for Career Area.

**Strengths:** Improved handling of certain career categories, especially with better precision for "AI" and "Cloud."

**Weaknesses:** Many classes in Career Area (e.g., "Data Engineering" and "GovTech") are still not well captured, with low recall across some categories.

**Interpretation:** While Resource Type classification improved slightly, the added complexity reduced accuracy for Career Area, indicating overfitting on expanded labels without sufficient data for each category.

In [None]:
# prompt: show this in a df y_test_career, y_pred_career
pd.DataFrame({'y_test_career': y_test_career, 'y_pred_career': y_pred_career})

Unnamed: 0,y_test_career,y_pred_career
16250,Data Analytics,Data Analytics
7905,AI,Tech
19509,UX,UX
19153,Entrepreneur,Entrepreneur
20079,Data Analytics,Data Analytics
...,...,...
1390,AI,AI
28991,AI,AI
8039,UX,AI
3692,Data Analytics,AI


## Multi-Class Model

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Function to assign multiple labels based on keywords
def assign_multi_labels(text, label_dict):
    labels = []
    for label, keywords in label_dict.items():
        if any(keyword.lower() in str(text).lower() for keyword in keywords):
            labels.append(label)
    return labels if labels else ["general_discussion"]

# Load and preprocess the data
file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv'
df = pd.read_csv(file_path)

# Filter to items with a created date of October 1, 2019 or later
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df = df[df['created_at'] >= '2019-10-01']


# Apply multi-label assignment functions
df['resource_type'] = df['full_text'].apply(lambda x: assign_multi_labels(x, resource_labels))
df['career_area'] = df['full_text'].apply(lambda x: assign_multi_labels(x, career_labels))

# Convert labels to multi-label binary format using MultiLabelBinarizer
mlb_resource = MultiLabelBinarizer()
mlb_career = MultiLabelBinarizer()

y_resource = mlb_resource.fit_transform(df['resource_type'])
y_career = mlb_career.fit_transform(df['career_area'])

# Extract links to a new field
df['resource_link'] = df['full_text'].str.extract(r'(https?://\S+)')

  df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')


In [None]:
# Text vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X = tfidf.fit_transform(df['full_text'])

# Split data for resource type prediction
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, test_size=0.2, random_state=42)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, test_size=0.2, random_state=42)

# Train multi-label classifier for resource type using Logistic Regression
resource_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
resource_model.fit(X_train_res, y_train_res)

# Train multi-label classifier for career area using Logistic Regression
career_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
career_model.fit(X_train_career, y_train_career)

# Evaluate both models
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Display classification reports
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res, target_names=mlb_resource.classes_))

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career, target_names=mlb_career.classes_))


Resource Type Classification Report
                    precision    recall  f1-score   support

          bootcamp       1.00      0.82      0.90       431
       conferences       1.00      0.54      0.70       188
      entrepreneur       1.00      0.44      0.61       159
general_discussion       0.96      1.00      0.98      6389
               job       1.00      0.71      0.83       297
            resume       1.00      0.54      0.70       113
       scholarship       1.00      0.69      0.82       103
        upskilling       1.00      0.90      0.95       532

         micro avg       0.97      0.94      0.95      8212
         macro avg       1.00      0.70      0.81      8212
      weighted avg       0.97      0.94      0.95      8212
       samples avg       0.96      0.95      0.95      8212


Career Area Classification Report
                                precision    recall  f1-score   support

                            AI       1.00      0.56      0.72      1229
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Accuracy:** 95% for Resource Type (samples avg), 79% for Career Area (samples avg).

**Strengths:** Highest accuracy and balanced recall for Resource Type, good micro-average recall for Career Area. This model shows strong performance for multi-label classification, especially for labels that appear in combination.

**Weaknesses:** Some Career Area categories like "Non-Tech," "Digital Marketing," and "Finance" still have low recall and F1-scores.

**Interpretation:** This model benefits from multi-label flexibility, allowing each tweet to be assigned multiple resource and career labels, resulting in more accurate predictions. However, Career Area categories with limited training samples still suffer in performance.

In [None]:
# prompt: show the df with the predicted labels

# Predict labels for the entire dataset
y_pred_res_all = resource_model.predict(X)
y_pred_career_all = career_model.predict(X)

# Convert predicted labels back to original format
predicted_resource_labels = mlb_resource.inverse_transform(y_pred_res_all)
predicted_career_labels = mlb_career.inverse_transform(y_pred_career_all)

# Add predicted labels to the DataFrame
df['predicted_resource_labels'] = predicted_resource_labels
df['predicted_career_labels'] = predicted_career_labels

# Show the DataFrame with predicted labels
df[['full_text', 'predicted_resource_labels', 'predicted_career_labels']]

Unnamed: 0,full_text,predicted_resource_labels,predicted_career_labels
0,@abeck617 Facts!,"(general_discussion,)","(general_discussion,)"
1,@LoosCoilz Oh that was a good class!,"(general_discussion,)","(general_discussion,)"
2,@Shellyplus2 Girlllll... that literally be the...,"(general_discussion,)","(general_discussion,)"
3,I get a lot of people asking me how to get the...,"(bootcamp, scholarship, upskilling)","(Cloud,)"
4,@LoosCoilz Mine had rewiring needed 😵‍💫,"(general_discussion,)","(general_discussion,)"
...,...,...,...
41892,@liammotivado @eugenecheang There is an in hou...,"(general_discussion,)","(general_discussion,)"
41893,@TamarBurton Yea this is overall forgiveness.,"(general_discussion,)","(general_discussion,)"
41894,@abhiondemand @clickedco I saw this but it was...,"(general_discussion,)","(general_discussion,)"
41895,RT @tondaylea: Cyber opps!,"(general_discussion,)","(Cybersecurity,)"


## Multi-label Binary Classification

In [None]:
# Vectorize the text
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X = tfidf.fit_transform(df['full_text'])

# Keep tweet_id as a separate variable
tweet_ids = df['tweet_id']

# Split data for both resource and career areas, ensuring tweet_id stays aligned
X_train_res, X_test_res, y_train_res, y_test_res, tweet_ids_train_res, tweet_ids_test_res = train_test_split(
    X, y_resource, tweet_ids, test_size=0.2, random_state=42
)
X_train_career, X_test_career, y_train_career, y_test_career, tweet_ids_train_career, tweet_ids_test_career = train_test_split(
    X, y_career, tweet_ids, test_size=0.2, random_state=42
)

# Train multi-label classifier for resource type using Logistic Regression
resource_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
resource_model.fit(X_train_res, y_train_res)

# Train multi-label classifier for career area using Logistic Regression
career_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
career_model.fit(X_train_career, y_train_career)

# Predict on the test set
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Combine predictions with tweet IDs for easy joining
resource_results = pd.DataFrame(y_pred_res, columns=mlb_resource.classes_)
resource_results['tweet_id'] = tweet_ids_test_res.values
career_results = pd.DataFrame(y_pred_career, columns=mlb_career.classes_)
career_results['tweet_id'] = tweet_ids_test_career.values

# Display classification reports for both models
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res, target_names=mlb_resource.classes_))

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career, target_names=mlb_career.classes_))

Resource Type Classification Report
                    precision    recall  f1-score   support

          bootcamp       1.00      0.82      0.90       431
       conferences       1.00      0.54      0.70       188
      entrepreneur       1.00      0.44      0.61       159
general_discussion       0.96      1.00      0.98      6389
               job       1.00      0.71      0.83       297
            resume       1.00      0.54      0.70       113
       scholarship       1.00      0.69      0.82       103
        upskilling       1.00      0.90      0.95       532

         micro avg       0.97      0.94      0.95      8212
         macro avg       1.00      0.70      0.81      8212
      weighted avg       0.97      0.94      0.95      8212
       samples avg       0.96      0.95      0.95      8212


Career Area Classification Report
                                precision    recall  f1-score   support

                            AI       1.00      0.56      0.72      1229
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Accuracy:** 95% for Resource Type (samples avg), 79% for Career Area (samples avg).

**Strengths:** High precision across most Resource Type categories, strong performance on popular Career Area labels like AI, Tech, and general_discussion.

**Weaknesses:** Lower recall for underrepresented Career Area categories, such as Finance, GovTech, and Non-Tech.

**Interpretation:** This multi-label binary approach captures frequent categories well and can handle multiple labels per tweet, but would benefit from more data for less common labels and potentially further tuning.

**Recommendation:** Model 4 shows robust multi-label capabilities, making it the most versatile and accurate choice among the models tested.

In [None]:
# Final combined results with tweet_id to facilitate joining
final_results = pd.merge(resource_results, career_results, on="tweet_id", suffixes=('_resource', '_career'))
final_results.head()

Unnamed: 0,bootcamp,conferences,entrepreneur,general_discussion_resource,job,resume,scholarship,upskilling,tweet_id,AI,...,Project Management,Salesforce,ServiceNow,Software & Systems Engineering,Software or Web Development,Students,Tech,Tech Sales,UX,general_discussion_career
0,0,0,0,1,0,0,0,0,1734602819569852906,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,1,0,0,0,0,1446076794524868617,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,1546866848783810560,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,1597582663975866368,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,1471868566521712655,0,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# Predict labels for the entire original dataset without filtering
y_pred_res_all = resource_model.predict(X)
y_pred_career_all = career_model.predict(X)

# Convert predicted labels back to their original format
predicted_resource_labels = mlb_resource.inverse_transform(y_pred_res_all)
predicted_career_labels = mlb_career.inverse_transform(y_pred_career_all)

# Add predicted labels back to the original DataFrame
df['predicted_resource_labels'] = predicted_resource_labels
df['predicted_career_labels'] = predicted_career_labels

# Display the DataFrame with tweet_id, full_text, and predicted labels
df_with_predictions = df[['tweet_id', 'full_text', 'predicted_resource_labels', 'predicted_career_labels']]
df_with_predictions

Unnamed: 0,tweet_id,full_text,predicted_resource_labels,predicted_career_labels
0,1845998600431481245,@abeck617 Facts!,"(general_discussion,)","(general_discussion,)"
1,1845998377991131222,@LoosCoilz Oh that was a good class!,"(general_discussion,)","(general_discussion,)"
2,1845997893221744658,@Shellyplus2 Girlllll... that literally be the...,"(general_discussion,)","(general_discussion,)"
3,1845997658068209681,I get a lot of people asking me how to get the...,"(bootcamp, scholarship, upskilling)","(Cloud,)"
4,1845842245343129704,@LoosCoilz Mine had rewiring needed 😵‍💫,"(general_discussion,)","(general_discussion,)"
...,...,...,...,...
41892,1580600655252983809,@liammotivado @eugenecheang There is an in hou...,"(general_discussion,)","(general_discussion,)"
41893,1580597002739384337,@TamarBurton Yea this is overall forgiveness.,"(general_discussion,)","(general_discussion,)"
41894,1580590730115891201,@abhiondemand @clickedco I saw this but it was...,"(general_discussion,)","(general_discussion,)"
41895,1580590490210033665,RT @tondaylea: Cyber opps!,"(general_discussion,)","(Cybersecurity,)"


### Remove general_discussion Labels

In [None]:
# Filter out rows where 'general_discussion' appears in predicted labels
df_filtered = df[~df['predicted_resource_labels'].apply(lambda x: 'general_discussion' in x)].copy()
df_filtered_with_predictions = df_filtered[~df_filtered['predicted_career_labels'].apply(lambda x: 'general_discussion' in x)].copy()

# Display tweet_id, full_text, and predicted labels without general_discussion
df_filtered_with_predictions[['tweet_id', 'full_text', 'predicted_resource_labels', 'predicted_career_labels']]
df_filtered_with_predictions

Unnamed: 0,tweet_id,created_at,created_at_est,full_text,favorite_count,retweet_count,in_reply_to_screen_name,lang,source,user_mentions,hashtags,tweet_url,resource_type,career_area,resource_link,predicted_resource_labels,predicted_career_labels
3,1845997658068209681,2024-10-15 01:18:33+00:00,2024-10-14 21:18:33,I get a lot of people asking me how to get the...,105,6,,en,"<a href=""http://twitter.com/download/android"" ...",[None],[],https://twitter.com/teneikaask_you/status/1845...,"[scholarship, bootcamp, upskilling]","[Cloud, Project Management]",,"(bootcamp, scholarship, upskilling)","(Cloud,)"
42,1843261333787349327,2024-10-07 12:05:22+00:00,2024-10-07 08:05:22,RT @KLAZEMATICS: Like this organization that m...,0,0,,en,"<a href=""http://twitter.com/download/android"" ...","['KLAZEMATICS', None]",[],https://twitter.com/teneikaask_you/status/1843...,[entrepreneur],"[Entrepreneur, Finance]",,"(entrepreneur,)","(Entrepreneur,)"
62,1791927729979273458,2024-05-18 20:23:57+00:00,2024-05-18 16:23:57,@whoisteezy @Jii_masunn They have a big appren...,0,0,whoisteezy,en,"<a href=""http://twitter.com/download/android"" ...","['whoisteezy', 'Jii_masunn', None]",[],https://twitter.com/teneikaask_you/status/1791...,"[job, upskilling]","[Data Analytics, AI]",,"(job, upskilling)","(AI,)"
63,1791927480917401847,2024-05-18 20:22:58+00:00,2024-05-18 16:22:58,@Jii_masunn The goal of this is to identify th...,7,1,Jii_masunn,en,"<a href=""http://twitter.com/download/android"" ...","['Jii_masunn', None]",[],https://twitter.com/teneikaask_you/status/1791...,"[bootcamp, upskilling]",[AI],,(),"(AI,)"
67,1791927049101115677,2024-05-18 20:21:15+00:00,2024-05-18 16:21:15,@50Pipz Had you taken any cyber courses before...,0,0,50Pipz,en,"<a href=""http://twitter.com/download/android"" ...","['50Pipz', None]",[],https://twitter.com/teneikaask_you/status/1791...,[upskilling],[Cybersecurity],,"(upskilling,)","(Cybersecurity,)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41857,1580932941596835846,2022-10-14 14:46:00+00:00,2022-10-14 10:46:00,Samsung has a FREE Innovation &amp; AI bootcam...,912,305,,en,"<a href=""https://mobile.twitter.com"" rel=""nofo...",[None],[],https://twitter.com/teneikaask_you/status/1580...,[bootcamp],"[AI, UX, Data]",https://t.co/fWbcjQnjuL,"(bootcamp,)","(AI, Data, UX)"
41869,1580701954799325184,2022-10-13 23:28:08+00:00,2022-10-13 19:28:08,RT @BlackInCyberCo1: BIC Red Team Development ...,0,0,,en,"<a href=""http://twitter.com/download/android"" ...","['BlackInCyberCo1', None]",[],https://twitter.com/teneikaask_you/status/1580...,[bootcamp],"[Data Analytics, Cybersecurity]",https://t.co/GdJFbwp7sq,"(bootcamp,)",()
41872,1580678359667060736,2022-10-13 21:54:23+00:00,2022-10-13 17:54:23,RT @teneikaask_you: This isn't specific to tec...,0,0,,en,"<a href=""http://twitter.com/download/android"" ...","['teneikaask_you', None]",[],https://twitter.com/teneikaask_you/status/1580...,[entrepreneur],"[Entrepreneur, Tech]",,"(entrepreneur,)","(Entrepreneur, Tech)"
41874,1580678334106591232,2022-10-13 21:54:17+00:00,2022-10-13 17:54:17,RT @teneikaask_you: If you are looking for gra...,0,0,,en,"<a href=""http://twitter.com/download/android"" ...","['teneikaask_you', None]",[],https://twitter.com/teneikaask_you/status/1580...,"[entrepreneur, bootcamp, upskilling]","[Entrepreneur, AI]",,"(bootcamp, entrepreneur, upskilling)","(AI, Entrepreneur)"


In [None]:
# prompt: df_with_predictions to csv name classified_tweets

# Assuming df_with_predictions is already defined as in your code

df_with_predictions.to_csv('/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/classified_tweets.csv', index=False)

In [None]:
df_filtered_with_predictions = df_filtered_with_predictions[[
    'created_at_est', 'full_text', 'favorite_count', 'retweet_count',
    'tweet_url', 'resource_type', 'career_area'
]]
df_filtered_with_predictions.to_csv('/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/classified_filtered_tweets.csv', index=False)

In [None]:
# Save the DataFrame as a JSON file
df_filtered_with_predictions.to_json('/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/classified_filtered_tweets.json', orient='records', lines=True)

**Suggested Improvements:**

**Increase Sample Diversity:** If possible, gather more labeled examples, especially for underrepresented categories in Career Area (e.g., "Finance," "Non-Tech").

**Data Augmentation:** For Career Area classes with low samples, consider synthetic data generation (e.g., paraphrasing techniques) to balance class distribution.

**Use Pre-trained Embeddings:** Switch to embeddings (e.g., BERT or TF-IDF with n-grams) to capture more context in tweets, which may help distinguish nuanced Career Area labels.

**Recommended Model:**

Model 3 is the most promising due to its multi-label approach and balanced performance across Resource Type and Career Area. Further tuning and potentially more data can improve Career Area classification for low-sample classes.

In [None]:
save_and_push_to_github("Twitter Classification Update")

# Analysis





### Performance Analysis:


# LinkedIn

In [None]:
import os

def save_and_push_to_github(commit_message="Auto-save from Google Colab"):
    # Add all changes
    os.system("git add .")
    # Commit changes
    os.system(f'git commit -m "{commit_message}"')
    # Push changes to the GitHub repository
    os.system("git push origin main")  # Use 'master' if your default branch is named 'master'

In [None]:
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def autosave(line, cell):
    # Execute the cell
    exec(cell)
    # After execution, save and push to GitHub
    save_and_push_to_github("Autosave after cell execution")

In [None]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Define the file path (update this path to your file location in Google Drive)
csv_file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/Shares.csv'

import pandas as pd
df = pd.read_csv(csv_file_path)
print("CSV file loaded successfully into a DataFrame.")
# Now you can work with the DataFrame 'df'
df.head() # Example: Display the first few rows of the DataFrame

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CSV file loaded successfully into a DataFrame.


Unnamed: 0,Date,ShareLink,ShareCommentary,SharedUrl,MediaUrl,Visibility
0,2024-11-10 20:18:59,https://www.linkedin.com/feed/update/urn%3Ali%...,I find it very odd that LinkedIn speaks about ...,,,MEMBER_NETWORK
1,2024-10-15 20:22:46,https://www.linkedin.com/feed/update/urn%3Ali%...,Join me at Techsgiving!!! I'll be presenting a...,,,MEMBER_NETWORK
2,2024-06-14 21:27:42,https://www.linkedin.com/feed/update/urn%3Ali%...,Did I just find out that I've been COINED?? 🤗,,,MEMBER_NETWORK
3,2024-06-10 13:04:21,https://www.linkedin.com/feed/update/urn%3Ali%...,Really enjoyed giving this talk with DON IT Ea...,,,MEMBER_NETWORK
4,2024-05-09 18:52:49,https://www.linkedin.com/feed/update/urn%3Ali%...,You have just one more day to sign up!!! https...,,,MEMBER_NETWORK


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re


# Step 0: Filter to items with a created date of October 1, 2019 or later
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Assigning labels for `resource_type` and `career_area` based on keywords
resource_labels = {
    "entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "scholarship": ["scholarship"],
    "bootcamp": ["bootcamp"],
    "resume": ["resume"],
    "apprenticeship": ["apprenticeship"],
    "job": ["job opening", "hiring", "position"],
    "upskilling": ["upskill", "learning", "training"],
    "conferences": ["conference", "event", "seminar"],
    "general_discussion": ["discussion", "opinion", "thoughts"]
}

career_labels = {
    "Entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "Data Analytics": ["data analytics", "data analysis"],
    "AI": ["artificial intelligence", "ai"],
    "Data Engineering": ["data engineering", "data pipeline"],
    "ServiceNow": ["servicenow"],
    "Salesforce": ["salesforce"],
    "Cloud": ["cloud", "aws", "azure", "gcp"],
    "UX": ["ux", "user experience"],
    "Product Management": ["product management"],
    "Product Design": ["product design"],
    "Project Management": ["project management", "pmp"],
    "Digital Marketing": ["digital marketing"],
    "Software & Systems Engineering": ["software engineering", "systems engineering"],
    "Data": ["data"],
    "Software or Web Development": ["software development", "web development"],
    "Tech Sales": ["tech sales"]
}

# Assign labels based on keywords
def assign_labels(text, label_dict):
    for label, keywords in label_dict.items():
        if any(keyword.lower() in str(text).lower() for keyword in keywords):
            return label
    return None

df['resource_type'] = df['ShareCommentary'].apply(lambda x: assign_labels(x, resource_labels))
df['career_area'] = df['ShareCommentary'].apply(lambda x: assign_labels(x, career_labels))

# Extract links to a new field
df['resource_link'] = df['ShareCommentary'].str.extract(r'(https?://\S+)')

# Prepare data for model training - drop rows without labels
labeled_df = df.dropna(subset=['resource_type', 'career_area'])


In [None]:
labeled_df.head()

Unnamed: 0,Date,ShareLink,ShareCommentary,SharedUrl,MediaUrl,Visibility,resource_type,career_area,resource_link
1,2024-10-15 20:22:46,https://www.linkedin.com/feed/update/urn%3Ali%...,Join me at Techsgiving!!! I'll be presenting a...,,,MEMBER_NETWORK,conferences,AI,https://lnkd.in/gkGmZQ3Y
5,2024-04-30 04:05:50,https://www.linkedin.com/feed/update/urn%3Ali%...,Build your next data science project with me n...,,,MEMBER_NETWORK,upskilling,Data Analytics,"https://lnkd.in/eaDTY498"""
6,2024-04-23 02:14:47,https://www.linkedin.com/feed/update/urn%3Ali%...,Some cool opportunities I discovered in the pa...,,,MEMBER_NETWORK,entrepreneur,Entrepreneur,"https://lnkd.in/e7HwcmVg"""
7,2024-04-12 13:47:38,https://www.linkedin.com/feed/update/urn%3Ali%...,Yesterday I hosted a Shadow Session with Click...,,,MEMBER_NETWORK,upskilling,AI,"https://lnkd.in/eTTtXuPb"""
15,2024-02-27 18:09:40,https://www.linkedin.com/feed/update/urn%3Ali%...,If you want to grow a career in Data as a Data...,,,MEMBER_NETWORK,resume,AI,"https://lnkd.in/efxxJnqA"""


In [None]:
# Step 2: Text vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(labeled_df['ShareCommentary'])

# Encode the labels
y_resource = labeled_df['resource_type']
y_career = labeled_df['career_area']

# Split data for resource type prediction
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, test_size=0.2, random_state=42)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, test_size=0.2, random_state=42)

# Step 3: Train the classifier for resource type
resource_model = LogisticRegression(max_iter=200)
resource_model.fit(X_train_res, y_train_res)

# Train the classifier for career area
career_model = LogisticRegression(max_iter=200)
career_model.fit(X_train_career, y_train_career)

# Step 4: Evaluate both models
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Classification reports
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res))
print(f"Resource Type Accuracy: {accuracy_score(y_test_res, y_pred_res):.2f}")

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career))
print(f"Career Area Accuracy: {accuracy_score(y_test_career, y_pred_career):.2f}")


Resource Type Classification Report
              precision    recall  f1-score   support

    bootcamp       0.00      0.00      0.00         9
 conferences       0.00      0.00      0.00         6
entrepreneur       0.37      0.87      0.52        15
         job       1.00      0.14      0.25         7
      resume       0.00      0.00      0.00         6
 scholarship       0.75      0.30      0.43        10
  upskilling       0.56      0.90      0.69        21

    accuracy                           0.49        74
   macro avg       0.38      0.32      0.27        74
weighted avg       0.43      0.49      0.38        74

Resource Type Accuracy: 0.49

Career Area Classification Report
                precision    recall  f1-score   support

            AI       0.55      1.00      0.71        36
         Cloud       0.00      0.00      0.00         5
          Data       0.00      0.00      0.00         8
Data Analytics       1.00      0.20      0.33        10
  Entrepreneur       1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# prompt: show this in a df y_test_career, y_pred_career
pd.DataFrame({'y_test_career': y_test_career, 'y_pred_career': y_pred_career})

Unnamed: 0,y_test_career,y_pred_career
662,AI,AI
142,Data Analytics,AI
51,AI,AI
1120,Cloud,AI
299,AI,AI
...,...,...
370,Entrepreneur,Entrepreneur
404,Entrepreneur,Entrepreneur
829,AI,AI
1108,Cloud,AI


In [None]:
# Assigning labels for `resource_type` and `career_area` based on keywords
resource_labels = {
    "entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "scholarship": ["scholarship"],
    "bootcamp": ["bootcamp", "program", "academy", "certificate"],
    "resume": ["resume", "cv", "curriculum vitae"],
    "job": ["job opening", "hiring", "position", "apprenticeship", "internship", "apprentice", "career opportunity", "job posting"],
    "upskilling": ["upskill", "learning", "training", "skill development", "course", "certification"],
    "conferences": ["conference", "event", "seminar", "webinar", "meetup", "workshop", "fireside chat", "fireside", "panel", "summit"],
    "general_discussion": ["discussion", "opinion", "thoughts", "general", "comment", "feedback", "insight"]
}

career_labels = {
    "Entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "Data Analytics": ["data analytics", "data analysis", "business intelligence", "BI", "data analyst"],
    "AI": ["artificial intelligence", "ai", "machine learning", "ML", "deep learning"],
    "Data Engineering": ["data engineering", "data pipeline", "data engineer", "data infrastructure", "ETL", "big data"],
    "ServiceNow": ["servicenow"],
    "Students": ["student", "students", "early career", "student"],
    "Salesforce": ["salesforce", "crm"],
    "Cloud": ["cloud", "aws", "azure", "gcp", "oracle", "cloud computing"],
    "Cybersecurity": ["cyber", "cybersecurity", "networking", "linux", "soc", "cyber security", "security", "penetration testing"],
    "UX": ["ux", "user experience", "ui", "design thinking", "interface", "human centered", "design", "ui"],
    "Product Management": ["product management", "product manager", "product", "agile"],
    "Product Design": ["product design", "product development"],
    "Project Management": ["project management", "pmp", "project planning"],
    "Digital Marketing": ["digital marketing", "social media", "seo", "content marketing"],
    "Software & Systems Engineering": ["software engineering", "systems engineering", "embedded systems", "systems architect"],
    "Data": ["data", "data science", "statistics"],
    "Software or Web Development": ["software development", "web development", "devops", "frontend", "backend", "full stack", "javascript", "react"],
    "Tech Sales": ["tech sales", "technical sales", "business development"],
    "Finance": ["finance", "financial", "accounting", "investment", "capital"],
    "Tech": ["tech", "technical", "technology", "FAANG", "Silicon Valley"],
    "Non-Tech": ["nontech", "non-tech", "non technical"],
    "GovTech": ["govtech", "gov-tech", "irs", "fema", "dod", "digital corps", "coding it forward", "digital service", "gsa", "tts", "public sector"]
}

# Assign labels based on keywords
def assign_labels(text, label_dict):
    for label, keywords in label_dict.items():
        if any(keyword.lower() in str(text).lower() for keyword in keywords):
            return label
    return None

df['resource_type'] = df['ShareCommentary'].apply(lambda x: assign_labels(x, resource_labels))
df['career_area'] = df['ShareCommentary'].apply(lambda x: assign_labels(x, career_labels))

# Extract links to a new field
df['resource_link'] = df['ShareCommentary'].str.extract(r'(https?://\S+)')

# Prepare data for model training - drop rows without labels
labeled_df = df.dropna(subset=['resource_type', 'career_area'])


In [None]:
labeled_df.head()

Unnamed: 0,Date,ShareLink,ShareCommentary,SharedUrl,MediaUrl,Visibility,resource_type,career_area,resource_link
1,2024-10-15 20:22:46,https://www.linkedin.com/feed/update/urn%3Ali%...,Join me at Techsgiving!!! I'll be presenting a...,,,MEMBER_NETWORK,conferences,AI,https://lnkd.in/gkGmZQ3Y
5,2024-04-30 04:05:50,https://www.linkedin.com/feed/update/urn%3Ali%...,Build your next data science project with me n...,,,MEMBER_NETWORK,upskilling,Data Analytics,"https://lnkd.in/eaDTY498"""
6,2024-04-23 02:14:47,https://www.linkedin.com/feed/update/urn%3Ali%...,Some cool opportunities I discovered in the pa...,,,MEMBER_NETWORK,entrepreneur,Entrepreneur,"https://lnkd.in/e7HwcmVg"""
7,2024-04-12 13:47:38,https://www.linkedin.com/feed/update/urn%3Ali%...,Yesterday I hosted a Shadow Session with Click...,,,MEMBER_NETWORK,upskilling,AI,"https://lnkd.in/eTTtXuPb"""
13,2024-03-11 16:54:27,https://www.linkedin.com/feed/update/urn%3Ali%...,If you've been interested in starting your car...,https://www.correlation-one.com/dod-cyber-sent...,,MEMBER_NETWORK,job,Data Analytics,"https://lnkd.in/e7HwcmVg"""


In [None]:

# Step 2: Text vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(labeled_df['ShareCommentary'])

# Encode the labels
y_resource = labeled_df['resource_type']
y_career = labeled_df['career_area']

# Split data for resource type prediction
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, test_size=0.2, random_state=42)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, test_size=0.2, random_state=42)

# Step 3: Train the classifier for resource type
resource_model = LogisticRegression(max_iter=200)
resource_model.fit(X_train_res, y_train_res)

# Train the classifier for career area
career_model = LogisticRegression(max_iter=200)
career_model.fit(X_train_career, y_train_career)

# Step 4: Evaluate both models
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Classification reports
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res))
print(f"Resource Type Accuracy: {accuracy_score(y_test_res, y_pred_res):.2f}")

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career))
print(f"Career Area Accuracy: {accuracy_score(y_test_career, y_pred_career):.2f}")


Resource Type Classification Report
                    precision    recall  f1-score   support

          bootcamp       0.44      1.00      0.61        35
       conferences       0.50      0.17      0.25         6
      entrepreneur       0.92      0.58      0.71        19
general_discussion       0.00      0.00      0.00         3
               job       1.00      0.08      0.14        13
            resume       0.00      0.00      0.00         2
       scholarship       1.00      0.50      0.67        12
        upskilling       1.00      0.23      0.38        13

          accuracy                           0.55       103
         macro avg       0.61      0.32      0.34       103
      weighted avg       0.72      0.55      0.50       103

Resource Type Accuracy: 0.55

Career Area Classification Report
                  precision    recall  f1-score   support

              AI       0.40      0.81      0.54        26
           Cloud       0.00      0.00      0.00         3
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# prompt: show this in a df y_test_career, y_pred_career
pd.DataFrame({'y_test_career': y_test_career, 'y_pred_career': y_pred_career})

Unnamed: 0,y_test_career,y_pred_career
871,AI,Data Analytics
1411,Entrepreneur,Entrepreneur
1174,AI,AI
446,Data Analytics,Data Analytics
1426,Entrepreneur,Entrepreneur
...,...,...
1033,AI,AI
487,Data Analytics,Data Analytics
837,Data Analytics,AI
1551,Cybersecurity,AI


# multiclass

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Function to assign multiple labels based on keywords
def assign_multi_labels(text, label_dict):
    labels = []
    for label, keywords in label_dict.items():
        if any(keyword.lower() in str(text).lower() for keyword in keywords):
            labels.append(label)
    return labels if labels else ["general_discussion"]

# Load and preprocess the data
file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/Shares.csv'
df = pd.read_csv(file_path)

# Filter to items with a created date of October 1, 2019 or later
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')


# Apply multi-label assignment functions
df['resource_type'] = df['ShareCommentary'].apply(lambda x: assign_multi_labels(x, resource_labels))
df['career_area'] = df['ShareCommentary'].apply(lambda x: assign_multi_labels(x, career_labels))

# Convert labels to multi-label binary format using MultiLabelBinarizer
mlb_resource = MultiLabelBinarizer()
mlb_career = MultiLabelBinarizer()

y_resource = mlb_resource.fit_transform(df['resource_type'])
y_career = mlb_career.fit_transform(df['career_area'])

# Extract links to a new field
df['resource_link'] = df['ShareCommentary'].str.extract(r'(https?://\S+)')

In [None]:
df

Unnamed: 0,Date,ShareLink,ShareCommentary,SharedUrl,MediaUrl,Visibility,resource_type,career_area,resource_link
0,2024-11-10 20:18:59,https://www.linkedin.com/feed/update/urn%3Ali%...,I find it very odd that LinkedIn speaks about ...,,,MEMBER_NETWORK,[general_discussion],"[AI, Data]",
1,2024-10-15 20:22:46,https://www.linkedin.com/feed/update/urn%3Ali%...,Join me at Techsgiving!!! I'll be presenting a...,,,MEMBER_NETWORK,[conferences],"[AI, Cybersecurity, Tech]",https://lnkd.in/gkGmZQ3Y
2,2024-06-14 21:27:42,https://www.linkedin.com/feed/update/urn%3Ali%...,Did I just find out that I've been COINED?? 🤗,,,MEMBER_NETWORK,[general_discussion],[general_discussion],
3,2024-06-10 13:04:21,https://www.linkedin.com/feed/update/urn%3Ali%...,Really enjoyed giving this talk with DON IT Ea...,,,MEMBER_NETWORK,[general_discussion],"[Data Analytics, AI, Data, Finance, Tech, GovT...",https://lnkd.in/ebvFFDhh
4,2024-05-09 18:52:49,https://www.linkedin.com/feed/update/urn%3Ali%...,You have just one more day to sign up!!! https...,,,MEMBER_NETWORK,[general_discussion],[general_discussion],https://lnkd.in/eaDTY498
...,...,...,...,...,...,...,...,...,...
1598,2015-01-25 20:54:33,https://www.linkedin.com/feed/update/urn%3Ali%...,,http://99u.com/articles/36393/how-to-find-a-me...,,MEMBER_NETWORK,[general_discussion],[general_discussion],
1599,2014-12-29 16:06:15,https://www.linkedin.com/feed/update/urn%3Ali%...,,,,MEMBER_NETWORK,[general_discussion],[general_discussion],
1600,2014-09-29 15:15:25,https://www.linkedin.com/feed/update/urn%3Ali%...,,,,MEMBER_NETWORK,[general_discussion],[general_discussion],
1601,2013-08-13 23:27:37,https://www.linkedin.com/feed/update/urn%3Ali%...,"""It's not about having the skill to do somethi...",,,MEMBER_NETWORK,[general_discussion],[general_discussion],


In [None]:
# Text vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
# Replace NaN values with an empty string before vectorization
df['ShareCommentary'] = df['ShareCommentary'].fillna('')  # Replace NaN with empty string

X = tfidf.fit_transform(df['ShareCommentary'])

# Split data for resource type prediction
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, test_size=0.2, random_state=42)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, test_size=0.2, random_state=42)

# Train multi-label classifier for resource type using Logistic Regression
resource_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
resource_model.fit(X_train_res, y_train_res)

# Train multi-label classifier for career area using Logistic Regression
career_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
career_model.fit(X_train_career, y_train_career)

# Evaluate both models
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Display classification reports
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res, target_names=mlb_resource.classes_))

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career, target_names=mlb_career.classes_))


Resource Type Classification Report
                    precision    recall  f1-score   support

          bootcamp       0.94      0.76      0.84        62
       conferences       1.00      0.14      0.25        35
      entrepreneur       1.00      0.04      0.08        23
general_discussion       0.88      0.95      0.91       222
               job       1.00      0.18      0.31        38
            resume       0.00      0.00      0.00         7
       scholarship       1.00      0.08      0.14        13
        upskilling       1.00      0.75      0.85        63

         micro avg       0.91      0.69      0.78       463
         macro avg       0.85      0.36      0.42       463
      weighted avg       0.92      0.69      0.72       463
       samples avg       0.84      0.79      0.80       463


Career Area Classification Report
                                precision    recall  f1-score   support

                            AI       0.97      0.67      0.79        88
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# prompt: show the df with the predicted labels

# Predict labels for the entire dataset
y_pred_res_all = resource_model.predict(X)
y_pred_career_all = career_model.predict(X)

# Convert predicted labels back to original format
predicted_resource_labels = mlb_resource.inverse_transform(y_pred_res_all)
predicted_career_labels = mlb_career.inverse_transform(y_pred_career_all)

# Add predicted labels to the DataFrame
df['predicted_resource_labels'] = predicted_resource_labels
df['predicted_career_labels'] = predicted_career_labels

# Show the DataFrame with predicted labels
df[['ShareCommentary', 'predicted_resource_labels', 'predicted_career_labels']]

Unnamed: 0,ShareCommentary,predicted_resource_labels,predicted_career_labels
0,I find it very odd that LinkedIn speaks about ...,"(general_discussion,)","(AI, Data)"
1,Join me at Techsgiving!!! I'll be presenting a...,(),()
2,Did I just find out that I've been COINED?? 🤗,"(general_discussion,)","(general_discussion,)"
3,Really enjoyed giving this talk with DON IT Ea...,"(general_discussion,)",()
4,You have just one more day to sign up!!! https...,"(general_discussion,)","(general_discussion,)"
...,...,...,...
1598,,"(general_discussion,)","(general_discussion,)"
1599,,"(general_discussion,)","(general_discussion,)"
1600,,"(general_discussion,)","(general_discussion,)"
1601,"""It's not about having the skill to do somethi...","(general_discussion,)","(general_discussion,)"


## multilabel

In [26]:
# Vectorize the text
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X = tfidf.fit_transform(df['ShareCommentary'])

# Keep tweet_id as a separate variable
tweet_ids = df['ShareLink']

# Split data for both resource and career areas, ensuring tweet_id stays aligned
X_train_res, X_test_res, y_train_res, y_test_res, tweet_ids_train_res, tweet_ids_test_res = train_test_split(
    X, y_resource, tweet_ids, test_size=0.2, random_state=42
)
X_train_career, X_test_career, y_train_career, y_test_career, tweet_ids_train_career, tweet_ids_test_career = train_test_split(
    X, y_career, tweet_ids, test_size=0.2, random_state=42
)

# Train multi-label classifier for resource type using Logistic Regression
resource_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
resource_model.fit(X_train_res, y_train_res)

# Train multi-label classifier for career area using Logistic Regression
career_model = MultiOutputClassifier(LogisticRegression(max_iter=200))
career_model.fit(X_train_career, y_train_career)

# Predict on the test set
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Combine predictions with tweet IDs for easy joining
resource_results = pd.DataFrame(y_pred_res, columns=mlb_resource.classes_)
resource_results['ShareLink'] = tweet_ids_test_res.values
career_results = pd.DataFrame(y_pred_career, columns=mlb_career.classes_)
career_results['ShareLink'] = tweet_ids_test_career.values

# Display classification reports for both models
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res, target_names=mlb_resource.classes_))

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career, target_names=mlb_career.classes_))

Resource Type Classification Report
                    precision    recall  f1-score   support

          bootcamp       0.94      0.76      0.84        62
       conferences       1.00      0.14      0.25        35
      entrepreneur       1.00      0.04      0.08        23
general_discussion       0.88      0.95      0.91       222
               job       1.00      0.18      0.31        38
            resume       0.00      0.00      0.00         7
       scholarship       1.00      0.08      0.14        13
        upskilling       1.00      0.75      0.85        63

         micro avg       0.91      0.69      0.78       463
         macro avg       0.85      0.36      0.42       463
      weighted avg       0.92      0.69      0.72       463
       samples avg       0.84      0.79      0.80       463


Career Area Classification Report
                                precision    recall  f1-score   support

                            AI       0.97      0.67      0.79        88
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# Final combined results with tweet_id to facilitate joining
final_results = pd.merge(resource_results, career_results, on="ShareLink", suffixes=('_resource', '_career'))
final_results.head()

Unnamed: 0,bootcamp,conferences,entrepreneur,general_discussion_resource,job,resume,scholarship,upskilling,ShareLink,AI,...,Project Management,Salesforce,ServiceNow,Software & Systems Engineering,Software or Web Development,Students,Tech,Tech Sales,UX,general_discussion_career
0,0,0,0,1,0,0,0,0,https://www.linkedin.com/feed/update/urn%3Ali%...,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,https://www.linkedin.com/feed/update/urn%3Ali%...,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,1,0,0,0,0,https://www.linkedin.com/feed/update/urn%3Ali%...,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,1,0,0,0,0,https://www.linkedin.com/feed/update/urn%3Ali%...,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,1,0,0,0,0,https://www.linkedin.com/feed/update/urn%3Ali%...,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Predict labels for the entire original dataset without filtering
y_pred_res_all = resource_model.predict(X)
y_pred_career_all = career_model.predict(X)

# Convert predicted labels back to their original format
predicted_resource_labels = mlb_resource.inverse_transform(y_pred_res_all)
predicted_career_labels = mlb_career.inverse_transform(y_pred_career_all)

# Add predicted labels back to the original DataFrame
df['predicted_resource_labels'] = predicted_resource_labels
df['predicted_career_labels'] = predicted_career_labels

# Display the DataFrame with tweet_id, full_text, and predicted labels
df_with_predictions = df[['ShareLink', 'ShareCommentary', 'predicted_resource_labels', 'predicted_career_labels']]
df_with_predictions

Unnamed: 0,ShareLink,ShareCommentary,predicted_resource_labels,predicted_career_labels
0,https://www.linkedin.com/feed/update/urn%3Ali%...,I find it very odd that LinkedIn speaks about ...,"(general_discussion,)","(AI, Data)"
1,https://www.linkedin.com/feed/update/urn%3Ali%...,Join me at Techsgiving!!! I'll be presenting a...,(),()
2,https://www.linkedin.com/feed/update/urn%3Ali%...,Did I just find out that I've been COINED?? 🤗,"(general_discussion,)","(general_discussion,)"
3,https://www.linkedin.com/feed/update/urn%3Ali%...,Really enjoyed giving this talk with DON IT Ea...,"(general_discussion,)",()
4,https://www.linkedin.com/feed/update/urn%3Ali%...,You have just one more day to sign up!!! https...,"(general_discussion,)","(general_discussion,)"
...,...,...,...,...
1598,https://www.linkedin.com/feed/update/urn%3Ali%...,,"(general_discussion,)","(general_discussion,)"
1599,https://www.linkedin.com/feed/update/urn%3Ali%...,,"(general_discussion,)","(general_discussion,)"
1600,https://www.linkedin.com/feed/update/urn%3Ali%...,,"(general_discussion,)","(general_discussion,)"
1601,https://www.linkedin.com/feed/update/urn%3Ali%...,"""It's not about having the skill to do somethi...","(general_discussion,)","(general_discussion,)"


In [32]:
# Filter out rows where 'general_discussion' appears in predicted labels
df_filtered = df[~df['predicted_resource_labels'].apply(lambda x: 'general_discussion' in x)].copy()
df_filtered_with_predictions = df_filtered[~df_filtered['predicted_career_labels'].apply(lambda x: 'general_discussion' in x)].copy()

# Display tweet_id, full_text, and predicted labels without general_discussion
df_filtered_with_predictions[['ShareLink', 'ShareCommentary', 'predicted_resource_labels', 'predicted_career_labels']]
df_filtered_with_predictions

Unnamed: 0,Date,ShareLink,ShareCommentary,SharedUrl,MediaUrl,Visibility,resource_type,career_area,resource_link,predicted_resource_labels,predicted_career_labels
1,2024-10-15 20:22:46,https://www.linkedin.com/feed/update/urn%3Ali%...,Join me at Techsgiving!!! I'll be presenting a...,,,MEMBER_NETWORK,[conferences],"[AI, Cybersecurity, Tech]",https://lnkd.in/gkGmZQ3Y,(),()
5,2024-04-30 04:05:50,https://www.linkedin.com/feed/update/urn%3Ali%...,Build your next data science project with me n...,,,MEMBER_NETWORK,[upskilling],"[Data Analytics, AI, Cybersecurity, UX, Data, ...","https://lnkd.in/eaDTY498""","(upskilling,)","(AI, Data, Data Analytics, Tech, UX)"
6,2024-04-23 02:14:47,https://www.linkedin.com/feed/update/urn%3Ali%...,Some cool opportunities I discovered in the pa...,,,MEMBER_NETWORK,"[entrepreneur, scholarship, bootcamp, job, ups...","[Entrepreneur, Data Analytics, AI, Cloud, Cybe...","https://lnkd.in/e7HwcmVg""","(bootcamp, upskilling)","(AI, Cloud, Cybersecurity, Data, Data Analytic..."
7,2024-04-12 13:47:38,https://www.linkedin.com/feed/update/urn%3Ali%...,Yesterday I hosted a Shadow Session with Click...,,,MEMBER_NETWORK,[upskilling],"[AI, Cybersecurity, UX, Tech]","https://lnkd.in/eTTtXuPb""","(upskilling,)","(AI, UX)"
13,2024-03-11 16:54:27,https://www.linkedin.com/feed/update/urn%3Ali%...,If you've been interested in starting your car...,https://www.correlation-one.com/dod-cyber-sent...,,MEMBER_NETWORK,[job],"[Data Analytics, AI, Cybersecurity, UX, Tech, ...","https://lnkd.in/e7HwcmVg""",(),"(AI, UX)"
...,...,...,...,...,...,...,...,...,...,...,...
1380,2019-01-28 13:06:37,https://www.linkedin.com/feed/update/urn%3Ali%...,"The Flatiron Opportunity Scholarship""\n""OHUB@F...",https://go.flatironschool.com/the-flatiron-opp...,,MEMBER_NETWORK,"[scholarship, upskilling]","[Students, Cybersecurity, UX, Software & Syste...",https://lnkd.in/erbdVdA,(),()
1381,2019-01-27 20:08:07,https://www.linkedin.com/feed/update/urn%3Ali%...,"The Flatiron Opportunity Scholarship""\n""OHUB@F...",,,MEMBER_NETWORK,"[scholarship, upskilling]","[Students, Cybersecurity, UX, Software & Syste...",https://lnkd.in/erbdVdA,(),()
1411,2018-11-13 02:07:58,https://www.linkedin.com/feed/update/urn%3Ali%...,Women have created the permission to be vulner...,,,MEMBER_NETWORK,"[entrepreneur, conferences]","[Entrepreneur, UX]",,(),()
1426,2018-09-29 16:39:32,https://www.linkedin.com/feed/update/urn%3Ali%...,Grace Hopper is now a moment in time I will re...,,,MEMBER_NETWORK,"[entrepreneur, conferences]","[Entrepreneur, Students, Cybersecurity, Produc...",,(),()


In [33]:
save_and_push_to_github("Twitter Classification Update")