<a href="https://colab.research.google.com/github/TeneikaAskew/tResourceGPT/blob/main/Twitter_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Twitter

In [None]:
import os

def save_and_push_to_github(commit_message="Auto-save from Google Colab"):
    """Stage, commit and push every change in the current git working tree.

    Runs ``git add .``, ``git commit -m <commit_message>`` and
    ``git push origin main`` in that order. Each command is started from an
    argument list (no shell), so quotes, ``;`` or ``$`` inside the message
    are passed to git literally instead of being interpreted by a shell.

    All three steps are always attempted, mirroring the previous best-effort
    behaviour (e.g. a commit with nothing to commit does not prevent the
    push); a step that fails is reported on stdout instead of being ignored.

    Args:
        commit_message: Text used verbatim as the commit message.

    Returns:
        None.
    """
    import subprocess  # local import: only `os` is imported alongside this function

    git_steps = [
        ["git", "add", "."],
        ["git", "commit", "-m", str(commit_message)],
        # Use 'master' if your default branch is named 'master'
        ["git", "push", "origin", "main"],
    ]
    for step in git_steps:
        step_name = " ".join(step[:2])
        try:
            finished = subprocess.run(step, check=False)
        except OSError as err:
            # e.g. the git executable is missing from PATH
            print(f"Could not run '{step_name}': {err}")
            continue
        if finished.returncode != 0:
            print(f"'{step_name}' exited with status {finished.returncode}")

In [None]:
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def autosave(line, cell=None):
    """Run code, then commit and push the working tree to GitHub.

    Usable as ``%autosave <statement>`` (line magic) or ``%%autosave`` (cell
    magic). IPython calls a line+cell magic with only ``line`` in line mode,
    so ``cell`` must default to ``None``.

    The code is executed in the interactive user namespace so that variables,
    functions and imports it defines remain available to later cells. If the
    code raises, the exception propagates and nothing is pushed.

    Args:
        line: Text following the magic name on the same line. Executed in
            line mode; ignored in cell mode.
        cell: Body of the cell in cell mode, ``None`` in line mode.
    """
    from IPython import get_ipython

    source = line if cell is None else cell
    # A bare exec(source) inside this function would write assignments into a
    # throwaway local scope; passing the user namespace makes them persist.
    exec(source, get_ipython().user_ns)
    # After execution, save and push to GitHub
    save_and_push_to_github("Autosave after cell execution")


## ETL

In [None]:
# Step 1: Make the Google Drive folder that holds the Twitter archive available
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Input and output locations (point js_file_path at your own archive file)
js_file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/tweets.js'  # Update with the correct path
csv_file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv'

# Step 3: Read the archive file and decode the JSON it wraps
import json
from datetime import datetime
import pytz
import pandas as pd

with open(js_file_path, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Everything after the first '=' is the payload (drops 'window.YTD.tweets.part0 =')
content = raw_text.split('=', 1)[-1].strip()
# A trailing semicolon, if present, is not valid JSON and is stripped first
data = json.loads(content.rstrip(';'))

# Step 4: Flatten each archive entry into one row of selected fields
eastern = pytz.timezone('US/Eastern')
tweet_data = []
for tweet in data:
    tweet_info = tweet.get('tweet', {})

    # Timestamp as stored in the archive, plus a US/Eastern rendering of it
    created_at_utc = tweet_info.get("created_at")
    if not created_at_utc:
        est_time_str = None
    else:
        utc_time = datetime.strptime(created_at_utc, '%a %b %d %H:%M:%S +0000 %Y')
        est_time = utc_time.replace(tzinfo=pytz.utc).astimezone(eastern)
        est_time_str = est_time.strftime('%Y-%m-%d %H:%M:%S')

    flattened_tweet = {
        "tweet_id": tweet_info.get("id_str"),
        "created_at": created_at_utc,
        "created_at_est": est_time_str,  # New column with EST time
    }
    # Fields copied across under their original names
    for field in ("full_text", "favorite_count", "retweet_count",
                  "in_reply_to_screen_name", "lang", "source"):
        flattened_tweet[field] = tweet_info.get(field)
    mentions = tweet_info.get("entities", {}).get("user_mentions", [])
    flattened_tweet["user_mentions"] = [m.get("screen_name") for m in mentions]

    tweet_data.append(flattened_tweet)

# One row per tweet
df = pd.DataFrame(tweet_data)

# Step 5: Persist the table as CSV
df.to_csv(csv_file_path, index=False)

print(f'File successfully converted and saved at: {csv_file_path}')

# Reload the saved CSV so later cells work from the file on disk
file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv'
df = pd.read_csv(file_path)

## Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re


# Step 0: Filter to items with a created date of October 1, 2019 or later
# utc=True yields a timezone-aware (UTC) column even when values fail to parse
# (they become NaT), so the comparison below never mixes naive and aware values.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)
# .copy() makes the filtered frame independent of the original, so adding
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[df['created_at'] >= pd.Timestamp('2019-10-01', tz='UTC')].copy()

# Assigning labels for `resource_type` and `career_area` based on keywords
# Each key is a label; its list holds the keywords that select that label.
# Dictionary order matters: assign_labels returns the first label that matches.
resource_labels = {
    "entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "scholarship": ["scholarship"],
    "bootcamp": ["bootcamp"],
    "resume": ["resume"],
    "apprenticeship": ["apprenticeship"],
    "job": ["job opening", "hiring", "position"],
    "upskilling": ["upskill", "learning", "training"],
    "conferences": ["conference", "event", "seminar"],
    "general_discussion": ["discussion", "opinion", "thoughts"]
}

# The generic "Data" entry is listed after the more specific data-related labels
# so that those are tried first.
career_labels = {
    "Entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "Data Analytics": ["data analytics", "data analysis"],
    "AI": ["artificial intelligence", "ai"],
    "Data Engineering": ["data engineering", "data pipeline"],
    "ServiceNow": ["servicenow"],
    "Salesforce": ["salesforce"],
    "Cloud": ["cloud", "aws", "azure", "gcp"],
    "UX": ["ux", "user experience"],
    "Product Management": ["product management"],
    "Product Design": ["product design"],
    "Project Management": ["project management", "pmp"],
    "Digital Marketing": ["digital marketing"],
    "Software & Systems Engineering": ["software engineering", "systems engineering"],
    "Data": ["data"],
    "Software or Web Development": ["software development", "web development"],
    "Tech Sales": ["tech sales"]
}

# Assign labels based on keywords
def assign_labels(text, label_dict, acronym_max_len=3):
    """Return the first label in ``label_dict`` with a keyword found in ``text``.

    Matching is case-insensitive and word-aware rather than a raw substring
    test: a keyword must begin at the start of a word, so "event" no longer
    matches "prevent". Keywords of ``acronym_max_len`` characters or fewer
    (e.g. "ai", "ux") must match a whole word, so "ai" no longer matches
    "training". Longer keywords still match as word prefixes, which keeps
    plurals and inflections ("entrepreneurs", "upskilling") working.

    Args:
        text: Text to classify. ``None`` and NaN are treated as having no
            label; any other value is converted with ``str()``.
        label_dict: Mapping of label -> list of keyword strings. Labels are
            tried in dictionary order and the first match wins.
        acronym_max_len: Longest keyword length that requires a whole-word
            match.

    Returns:
        The matching label, or ``None`` when no keyword matches.
    """
    import re  # local import keeps the function usable from any cell

    # Missing text (None, or float NaN from pandas) cannot carry a label
    if text is None or (isinstance(text, float) and text != text):
        return None

    haystack = str(text).lower()
    for label, keywords in label_dict.items():
        for keyword in keywords:
            pattern = r"(?<!\w)" + re.escape(keyword.lower())
            if len(keyword) <= acronym_max_len:
                pattern += r"(?!\w)"
            if re.search(pattern, haystack):
                return label
    return None

# Tag every tweet with its first matching resource type and career area
for target_column, keyword_map in (('resource_type', resource_labels),
                                   ('career_area', career_labels)):
    df[target_column] = df['full_text'].apply(assign_labels, args=(keyword_map,))

# Extract links to a new field (first http/https URL found in the text)
url_pattern = r'(https?://\S+)'
df['resource_link'] = df['full_text'].str.extract(url_pattern)

# Prepare data for model training - keep only rows that received both labels
has_both_labels = df['resource_type'].notna() & df['career_area'].notna()
labeled_df = df[has_both_labels]


In [None]:
# Step 2: Turn the tweet text into TF-IDF features (English stop words removed,
# vocabulary capped at 3000 terms)
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
X = tfidf.fit_transform(labeled_df['full_text'])

# Targets: one label column per classifier
y_resource, y_career = labeled_df['resource_type'], labeled_df['career_area']

# Both targets are split with the same test fraction and seed
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, **split_options)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, **split_options)

# Step 3: Fit one logistic-regression classifier per target
resource_model = LogisticRegression(max_iter=200).fit(X_train_res, y_train_res)
career_model = LogisticRegression(max_iter=200).fit(X_train_career, y_train_career)

# Step 4: Predict on the held-out rows
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Per-class report followed by overall accuracy, resource type first
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res))
resource_accuracy = accuracy_score(y_test_res, y_pred_res)
print(f"Resource Type Accuracy: {resource_accuracy:.2f}")

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career))
career_accuracy = accuracy_score(y_test_career, y_pred_career)
print(f"Career Area Accuracy: {career_accuracy:.2f}")


**Accuracy:** 87% for Resource Type, 79% for Career Area.

**Strengths:** High precision for specific categories like "entrepreneur" and "upskilling" in Resource Type.

**Weaknesses:** Low recall for less common categories, with some classes (like "general_discussion" in Career Area) showing zero recall.

**Interpretation:** This model performs well for high-frequency labels but struggles with underrepresented categories, especially in Career Area.

In [None]:
# prompt: show this in a df y_test_career, y_pred_career
# True vs. predicted career area for the held-out rows, side by side
pd.DataFrame(dict(y_test_career=y_test_career, y_pred_career=y_pred_career))

## Expansion of Labels

The label keyword lists are revised and expanded here; they are meant to be refined iteratively after each run.

In [None]:
# Assigning labels for `resource_type` and `career_area` based on keywords
# Each key is a label; its list holds the keywords that select that label.
# Order matters for single-label assignment (assign_labels returns the first
# matching label), so a label whose keywords contain another label's keyword
# must be listed before that other label.
resource_labels = {
    "entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "scholarship": ["scholarship"],
    "bootcamp": ["bootcamp", "program", "academy", "certificate"],
    "resume": ["resume", "cv", "curriculum vitae"],
    "job": ["job opening", "hiring", "position", "apprenticeship", "internship", "apprentice", "career opportunity", "job posting"],
    "upskilling": ["upskill", "learning", "training", "skill development", "course", "certification"],
    "conferences": ["conference", "event", "seminar", "webinar", "meetup", "workshop", "fireside chat", "fireside", "panel", "summit"],
    "general_discussion": ["discussion", "opinion", "thoughts", "general", "comment", "feedback", "insight"]
}

career_labels = {
    "Entrepreneur": ["entrepreneur", "founder", "business", "startup", "entrepreneurship", "funding","accelerator"],
    "Data Analytics": ["data analytics", "data analysis", "business intelligence", "BI", "data analyst"],
    "AI": ["artificial intelligence", "ai", "machine learning", "ML", "deep learning"],
    "Data Engineering": ["data engineering", "data pipeline", "data engineer", "data infrastructure", "ETL", "big data"],
    "ServiceNow": ["servicenow"],
    "Students": ["student", "students", "early career"],
    "Salesforce": ["salesforce", "crm"],
    "Cloud": ["cloud", "aws", "azure", "gcp", "oracle", "cloud computing"],
    "Cybersecurity": ["cyber", "cybersecurity", "networking", "linux", "soc", "cyber security", "security", "penetration testing"],
    # "Product Design" must precede "UX" (keyword "design") and
    # "Product Management" (keyword "product"); listed after them it could
    # never be returned by first-match labelling.
    "Product Design": ["product design", "product development"],
    "UX": ["ux", "user experience", "ui", "design thinking", "interface", "human centered", "design"],
    "Product Management": ["product management", "product manager", "product", "agile"],
    "Project Management": ["project management", "pmp", "project planning"],
    "Digital Marketing": ["digital marketing", "social media", "seo", "content marketing"],
    "Software & Systems Engineering": ["software engineering", "systems engineering", "embedded systems", "systems architect"],
    "Data": ["data", "data science", "statistics"],
    "Software or Web Development": ["software development", "web development", "devops", "frontend", "backend", "full stack", "javascript", "react"],
    "Tech Sales": ["tech sales", "technical sales", "business development"],
    "Finance": ["finance", "financial", "accounting", "investment", "capital"],
    "Tech": ["tech", "technical", "technology", "FAANG", "Silicon Valley"],
    "Non-Tech": ["nontech", "non-tech", "non technical"],
    "GovTech": ["govtech", "gov-tech", "irs", "fema", "dod", "digital corps", "coding it forward", "digital service", "gsa", "tts", "public sector"]
}

# Assign labels based on keywords
def assign_labels(text, label_dict, acronym_max_len=3):
    """Return the first label in ``label_dict`` with a keyword found in ``text``.

    Matching is case-insensitive and word-aware rather than a raw substring
    test: a keyword must begin at the start of a word, so "event" no longer
    matches "prevent". Keywords of ``acronym_max_len`` characters or fewer
    (e.g. "ai", "ui", "BI") must match a whole word, so "ai" no longer matches
    "training". Longer keywords still match as word prefixes, which keeps
    plurals and inflections ("entrepreneurs", "upskilling") working.

    Args:
        text: Text to classify. ``None`` and NaN are treated as having no
            label; any other value is converted with ``str()``.
        label_dict: Mapping of label -> list of keyword strings. Labels are
            tried in dictionary order and the first match wins.
        acronym_max_len: Longest keyword length that requires a whole-word
            match.

    Returns:
        The matching label, or ``None`` when no keyword matches.
    """
    import re  # local import keeps the function usable from any cell

    # Missing text (None, or float NaN from pandas) cannot carry a label
    if text is None or (isinstance(text, float) and text != text):
        return None

    haystack = str(text).lower()
    for label, keywords in label_dict.items():
        for keyword in keywords:
            pattern = r"(?<!\w)" + re.escape(keyword.lower())
            if len(keyword) <= acronym_max_len:
                pattern += r"(?!\w)"
            if re.search(pattern, haystack):
                return label
    return None

# Re-tag every tweet using the expanded keyword lists (first matching label wins)
for target_column, keyword_map in (('resource_type', resource_labels),
                                   ('career_area', career_labels)):
    df[target_column] = df['full_text'].apply(assign_labels, args=(keyword_map,))

# Extract links to a new field (first http/https URL found in the text)
url_pattern = r'(https?://\S+)'
df['resource_link'] = df['full_text'].str.extract(url_pattern)

# Prepare data for model training - keep only rows that received both labels
has_both_labels = df['resource_type'].notna() & df['career_area'].notna()
labeled_df = df[has_both_labels]


In [None]:

# Step 2: Turn the tweet text into TF-IDF features (English stop words removed,
# vocabulary capped at 3000 terms)
tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
X = tfidf.fit_transform(labeled_df['full_text'])

# Targets: one label column per classifier
y_resource, y_career = labeled_df['resource_type'], labeled_df['career_area']

# Both targets are split with the same test fraction and seed
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, **split_options)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, **split_options)

# Step 3: Fit one logistic-regression classifier per target
resource_model = LogisticRegression(max_iter=200).fit(X_train_res, y_train_res)
career_model = LogisticRegression(max_iter=200).fit(X_train_career, y_train_career)

# Step 4: Predict on the held-out rows
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Per-class report followed by overall accuracy, resource type first
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res))
resource_accuracy = accuracy_score(y_test_res, y_pred_res)
print(f"Resource Type Accuracy: {resource_accuracy:.2f}")

print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career))
career_accuracy = accuracy_score(y_test_career, y_pred_career)
print(f"Career Area Accuracy: {career_accuracy:.2f}")


**Accuracy:** 89% for Resource Type, 64% for Career Area.

**Strengths:** Improved handling of certain career categories, especially with better precision for "AI" and "Cloud."

**Weaknesses:** Many classes in Career Area (e.g., "Data Engineering" and "GovTech") are still not well captured, with low recall across some categories.

**Interpretation:** While Resource Type classification improved slightly, the added complexity reduced accuracy for Career Area, indicating overfitting on expanded labels without sufficient data for each category.

In [None]:
# prompt: show this in a df y_test_career, y_pred_career
# True vs. predicted career area for the held-out rows, side by side
pd.DataFrame(dict(y_test_career=y_test_career, y_pred_career=y_pred_career))

## Multi-Class Model

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Function to assign multiple labels based on keywords
def assign_multi_labels(text, label_dict, acronym_max_len=3):
    """Return every label in ``label_dict`` with a keyword found in ``text``.

    Matching is case-insensitive and word-aware rather than a raw substring
    test: a keyword must begin at the start of a word, and keywords of
    ``acronym_max_len`` characters or fewer (e.g. "ai", "ui") must match a
    whole word. This stops "ai" from matching "training" and "ui" from
    matching "building", while longer keywords still match plurals and
    inflections as word prefixes.

    Args:
        text: Text to classify. ``None`` and NaN are treated as matching
            nothing; any other value is converted with ``str()``.
        label_dict: Mapping of label -> list of keyword strings.
        acronym_max_len: Longest keyword length that requires a whole-word
            match.

    Returns:
        List of matching labels in dictionary order, or
        ``["general_discussion"]`` when nothing matches.
    """
    import re  # local import keeps the function usable from any cell

    def _mentions(keyword, haystack):
        # True when `keyword` occurs in `haystack` under the rules above
        pattern = r"(?<!\w)" + re.escape(keyword.lower())
        if len(keyword) <= acronym_max_len:
            pattern += r"(?!\w)"
        return re.search(pattern, haystack) is not None

    # Missing text (None, or float NaN from pandas) matches no keyword
    if text is None or (isinstance(text, float) and text != text):
        return ["general_discussion"]

    haystack = str(text).lower()
    labels = [
        label
        for label, keywords in label_dict.items()
        if any(_mentions(keyword, haystack) for keyword in keywords)
    ]
    return labels if labels else ["general_discussion"]

# Load and preprocess the data
file_path = '/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/converted_tweets.csv'
df = pd.read_csv(file_path)

# Filter to items with a created date of October 1, 2019 or later
# utc=True yields a timezone-aware (UTC) column even when values fail to parse
# (they become NaT), so the comparison below never mixes naive and aware values.
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce', utc=True)
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignments below (and the prediction columns added by later
# cells) do not trigger pandas' SettingWithCopyWarning.
df = df[df['created_at'] >= pd.Timestamp('2019-10-01', tz='UTC')].copy()


# Apply multi-label assignment functions
# Each cell holds a list of labels; rows without any keyword match get
# ["general_discussion"] from assign_multi_labels.
df['resource_type'] = df['full_text'].apply(lambda x: assign_multi_labels(x, resource_labels))
df['career_area'] = df['full_text'].apply(lambda x: assign_multi_labels(x, career_labels))

# Convert labels to multi-label binary format using MultiLabelBinarizer
# (one indicator column per label; the fitted binarizers are kept so that
# predictions can be mapped back to label names later)
mlb_resource = MultiLabelBinarizer()
mlb_career = MultiLabelBinarizer()

y_resource = mlb_resource.fit_transform(df['resource_type'])
y_career = mlb_career.fit_transform(df['career_area'])

# Extract links to a new field (first http/https URL found in the text)
df['resource_link'] = df['full_text'].str.extract(r'(https?://\S+)')

In [None]:
# TF-IDF features for every tweet (English stop words removed, vocabulary
# capped at 1000 terms)
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X = tfidf.fit_transform(df['full_text'])

# Both indicator matrices are split with the same test fraction and seed
split_options = {'test_size': 0.2, 'random_state': 42}
X_train_res, X_test_res, y_train_res, y_test_res = train_test_split(X, y_resource, **split_options)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(X, y_career, **split_options)

# One logistic regression per label column, wrapped by MultiOutputClassifier
resource_model = MultiOutputClassifier(LogisticRegression(max_iter=200)).fit(X_train_res, y_train_res)
career_model = MultiOutputClassifier(LogisticRegression(max_iter=200)).fit(X_train_career, y_train_career)

# Predictions for the held-out rows
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Per-label reports, using the binarizers' class names as row labels
resource_names = mlb_resource.classes_
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res, target_names=resource_names))

career_names = mlb_career.classes_
print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career, target_names=career_names))


**Accuracy:** 95% for Resource Type (samples avg), 79% for Career Area (samples avg).

**Strengths:** Highest accuracy and balanced recall for Resource Type, good micro-average recall for Career Area. This model shows strong performance for multi-label classification, especially for labels that appear in combination.

**Weaknesses:** Some Career Area categories like "Non-Tech," "Digital Marketing," and "Finance" still have low recall and F1-scores.

**Interpretation:** This model benefits from multi-label flexibility, allowing each tweet to be assigned multiple resource and career labels, resulting in more accurate predictions. However, Career Area categories with limited training samples still suffer in performance.

In [None]:
# prompt: show the df with the predicted labels

# Resource type: predict for every row of X, map the indicator rows back to
# label tuples, and store them on the frame
y_pred_res_all = resource_model.predict(X)
predicted_resource_labels = mlb_resource.inverse_transform(y_pred_res_all)
df['predicted_resource_labels'] = predicted_resource_labels

# Career area: same three steps
y_pred_career_all = career_model.predict(X)
predicted_career_labels = mlb_career.inverse_transform(y_pred_career_all)
df['predicted_career_labels'] = predicted_career_labels

# Show the text next to both sets of predicted labels
prediction_columns = ['full_text', 'predicted_resource_labels', 'predicted_career_labels']
df[prediction_columns]

## Multi-label Binary Classification

In [None]:
# TF-IDF features for every tweet (English stop words removed, vocabulary
# capped at 1000 terms)
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
X = tfidf.fit_transform(df['full_text'])

# tweet_id travels through the split as a third array so that each test row
# can be traced back to its tweet
tweet_ids = df['tweet_id']

# Same test fraction and seed for both targets
split_options = {'test_size': 0.2, 'random_state': 42}
(X_train_res, X_test_res,
 y_train_res, y_test_res,
 tweet_ids_train_res, tweet_ids_test_res) = train_test_split(X, y_resource, tweet_ids, **split_options)
(X_train_career, X_test_career,
 y_train_career, y_test_career,
 tweet_ids_train_career, tweet_ids_test_career) = train_test_split(X, y_career, tweet_ids, **split_options)

# One logistic regression per label column, wrapped by MultiOutputClassifier
resource_model = MultiOutputClassifier(LogisticRegression(max_iter=200)).fit(X_train_res, y_train_res)
career_model = MultiOutputClassifier(LogisticRegression(max_iter=200)).fit(X_train_career, y_train_career)

# Predict on the test set
y_pred_res = resource_model.predict(X_test_res)
y_pred_career = career_model.predict(X_test_career)

# Indicator columns named after the labels, plus the tweet_id of each test row
resource_results = pd.DataFrame(y_pred_res, columns=mlb_resource.classes_).assign(
    tweet_id=tweet_ids_test_res.values
)
career_results = pd.DataFrame(y_pred_career, columns=mlb_career.classes_).assign(
    tweet_id=tweet_ids_test_career.values
)

# Per-label reports, using the binarizers' class names as row labels
resource_names = mlb_resource.classes_
print("Resource Type Classification Report")
print(classification_report(y_test_res, y_pred_res, target_names=resource_names))

career_names = mlb_career.classes_
print("\nCareer Area Classification Report")
print(classification_report(y_test_career, y_pred_career, target_names=career_names))

**Accuracy:** 95% for Resource Type (samples avg), 79% for Career Area (samples avg).

**Strengths:** High precision across most Resource Type categories, strong performance on popular Career Area labels like AI, Tech, and general_discussion.

**Weaknesses:** Lower recall for underrepresented Career Area categories, such as Finance, GovTech, and Non-Tech.

**Interpretation:** This multi-label binary approach captures frequent categories well and can handle multiple labels per tweet, but would benefit from more data for less common labels and potentially further tuning.

**Recommendation:** Model 4 shows robust multi-label capabilities, making it the most versatile and accurate choice among the models tested.

In [None]:
# Join the resource and career predictions on tweet_id; label columns present
# in both frames are disambiguated with the _resource / _career suffixes
final_results = resource_results.merge(
    career_results,
    on="tweet_id",
    suffixes=('_resource', '_career'),
)
final_results.head()

In [None]:
# Predict labels for every row of X, i.e. all rows of the current df (train and
# test rows alike), then map the indicator rows back to label tuples

# Resource type
y_pred_res_all = resource_model.predict(X)
predicted_resource_labels = mlb_resource.inverse_transform(y_pred_res_all)
df['predicted_resource_labels'] = predicted_resource_labels

# Career area
y_pred_career_all = career_model.predict(X)
predicted_career_labels = mlb_career.inverse_transform(y_pred_career_all)
df['predicted_career_labels'] = predicted_career_labels

# Keep tweet_id, full_text and both sets of predicted labels for display/export
export_columns = ['tweet_id', 'full_text', 'predicted_resource_labels', 'predicted_career_labels']
df_with_predictions = df[export_columns]
df_with_predictions

### Remove general_discussion Labels

In [None]:
# Filter out rows where 'general_discussion' appears in predicted labels:
# first by predicted resource labels, then by predicted career labels.
# Each step takes a .copy() so the result is independent of df.
df_filtered = df[~df['predicted_resource_labels'].apply(lambda x: 'general_discussion' in x)].copy()
df_filtered_with_predictions = df_filtered[~df_filtered['predicted_career_labels'].apply(lambda x: 'general_discussion' in x)].copy()

# Display tweet_id, full_text, and predicted labels without general_discussion.
# This must be the cell's last expression to be rendered; the full frame stays
# available as df_filtered_with_predictions.
df_filtered_with_predictions[['tweet_id', 'full_text', 'predicted_resource_labels', 'predicted_career_labels']]

In [None]:
# prompt: df_with_predictions to csv name classified_tweets

# Write the tweet_id / text / predicted-label table to Drive without the index column
df_with_predictions.to_csv(
    path_or_buf='/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/classified_tweets.csv',
    index=False,
)

In [None]:
# Write the rows left after removing 'general_discussion' predictions to Drive,
# without the index column
df_filtered_with_predictions.to_csv(
    path_or_buf='/content/drive/MyDrive/TwitterLinkedIn_AI_ML_Project/classified_filtered_tweets.csv',
    index=False,
)

**Suggested Improvements:**

**Increase Sample Diversity:** If possible, gather more labeled examples, especially for underrepresented categories in Career Area (e.g., "Finance," "Non-Tech").

**Data Augmentation:** For Career Area classes with low samples, consider synthetic data generation (e.g., paraphrasing techniques) to balance class distribution.

**Use Richer Text Features:** Switch to pre-trained embeddings (e.g., BERT), or extend TF-IDF with n-grams, to capture more context in tweets, which may help distinguish nuanced Career Area labels.

**Recommended Model:**

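Model 3 is the most promising due to its multi-label approach and balanced performance across Resource Type and Career Area. Model 4 trains the same multi-label classifier and only adds `tweet_id` tracking, so this recommendation is consistent with the Model 4 recommendation above. Further tuning and potentially more data can improve Career Area classification for low-sample classes.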

In [None]:
# Commit and push the notebook's current state with a descriptive message
save_and_push_to_github(commit_message="Twitter Classification Update")

# Analysis





### Performance Analysis:


# LinkedIn