In [81]:
import pandas as pd
data = pd.read_csv("../data/linkedin_posts.csv")
data2 = pd.read_csv("../data/hate_speech.csv")

length1 = len(data)

In [82]:
# Append data2['text'] to data['Post Content']
new_rows = pd.DataFrame({'Post Content': data2['text'].tolist()})
data = pd.concat([data, new_rows], ignore_index=True)
# Convert all post contents to lowercase
data['Post Content'] = [text.lower() for text in data['Post Content']]
# Remove URLs
data['Post Content'] = data['Post Content'].str.replace(r'\bhttps?://\S+\b', '', regex=True)
# Remove punctuation (periods, commas, question marks, double quotes, etc.)
data['Post Content'] = data['Post Content'].str.replace(r'[^a-zA-Z ]', '', regex=True)

post_content = data['Post Content'].tolist()

In [83]:
import re

# Save post_content to a file called posts.txt
with open("posts.txt", "w", encoding="utf-8") as f:
    i = 0
    for post in post_content:
        i += 1
        f.write(str(i) + ". " + post + "\n")

In [84]:
# Read classified.txt and append its content as a new column to the data dataframe
with open("classified.txt", "r", encoding="utf-8") as f_classified:
    classified_labels = [line.strip() for line in f_classified]
    
length2 = len(classified_labels)

for i in range(length2):
    if re.search(r'\bnot\b', str(classified_labels[i]), re.IGNORECASE):
        classified_labels[i] = 0 #not cringe
    else:
        classified_labels[i] = 1 #cringe

print(len(classified_labels)) #should be 716
    
classified_labels.extend([1] * len(data2)) #for the hate speech posts

print(len(classified_labels))


716
41339


In [85]:
final_data = data.drop(index=range(length2, length1)).reset_index(drop=True)
final_data['Classified'] = classified_labels

final_data = final_data.iloc[:2400]

final_data = final_data.sample(frac=1, random_state=42).reset_index(drop=True)

test_data = final_data.iloc[1200:].copy()
final_data = final_data.iloc[:1200]

In [86]:
from gensim.models import Word2Vec

word2vec_model = Word2Vec(sentences=final_data["Post Content"], vector_size=100, window=5, min_count=2, workers=4, sg = 1) #sg = 1 for skip gram
word2vec_model.save("word2vec.model") #save to file named word2vec.model

In [87]:
import re

def analyze_linkedin_post(paragraph: str) -> dict:
    """
    Analyzes a LinkedIn post paragraph for emotional baiting keywords,
    structural features, and semantic indicators.

    Args:
        paragraph (str): The text content of the LinkedIn post.

    Returns:
        dict: A dictionary containing various analysis features.
    """

    results = {}
    
    # Count words that are all caps (and longer than 1 character to exclude 'I' or 'A')
    all_caps_words_count = 0
    for word in re.findall(r'\b[A-Z]+\b', paragraph): # Find all uppercase words in original case
        if len(word) > 1: # Exclude single-letter words like "I" or "A"
            all_caps_words_count += 1
    results['num_all_caps_words'] = all_caps_words_count
    
    results['num_exclamation_marks'] = paragraph.count('!')
    results['num_question_marks'] = paragraph.count('?')


    # --- Preprocessing ---
    lower_paragraph = paragraph.lower()
    words = re.findall(r'\b\w+\b', lower_paragraph) # Tokenize words, stripping punctuation
    num_words = len(words)
    num_chars = len(paragraph)

    # --- Keyword/Phrase Definitions (Case-insensitive matching) ---
    selling_course_keywords = ["buy", "enroll now", "masterclass", "exclusive program",
                                "webinar", "bootcamp", "course", "training", "workshop",
                                "learn how to", "join now", "sign up"]

    emotional_story_keywords = ["my journey", "struggle", "struggled", "tears",
                                "breakthrough", "against all odds", "hit rock bottom",
                                "lowest point", "lost", "sacrifice", "sacrificed",
                                "overwhelmed", "defeated", "almost gave up", "resilience",
                                "perseverance", "vulnerability"]

    clickbait_headline_regex = re.compile(
        r"(you won't believe|the secret to|this one trick|shocking truth|mind-blowing|"
        r"game-changer|defied expectations|achieved the impossible|unprecedented|"
        r"revolutionary|what happened next|revealed|hidden truth)",
        re.IGNORECASE
    )

    tag_people_cta_keywords = ["tag", "like if you agree", "share this post",
                                "comment if you agree", "react with", "mention a friend",
                                "spread the word", "repost"]

    comment_interested_cta_keywords = ["comment 'interested'", "type 'yes'",
                                        "dm me", "message me", "inbox me",
                                        "comment below", "let me know",
                                        "drop a comment", "tell me your thoughts",
                                        "what do you think", "join the conversation",
                                        "let's discuss", "share your opinion",
                                        "i want to hear from you", "let's connect",
                                        "i'd love to know", "what's your take",
                                        "i'm curious", "let's chat", "let's talk",
                                        "i'm interested", "let's engage", "let's collaborate",
                                        "let's brainstorm", "i want to hear"]

    humble_brag_keywords = ["blessed", "didn't expect this",
                            "just reached", "grateful", "honored", "overwhelmed",
                            "speechless", "never imagined", "a dream come true", "pinch myself"]

    purely_personal_topic_keywords = ["vacation", "family", "kids", "children",
                                        "wedding", "anniversary", "birthday", "pets",
                                        "dog", "cat", "home life", "weekend vibes", "my life",
                                        "personal story", "holiday", "my spouse", "partner",
                                        "date night", "travel", "adventure", "my journey",
                                        "self-care", "mental health", "wellness",
                                        "hobbies", "interests", "lifestyle", "daily routine",
                                        "life update", "my passion", "hobby"]

    generic_advice_keywords = ["never give up", "keep learning", "be persistent",
                                "stay hungry", "stay foolish", "growth mindset",
                                "daily habits", "consistency is key", "believe in yourself",
                                "manifest your dreams", "your why", "find your passion",
                                "embrace failure", "learn from mistakes", "stay positive"]

    # --- Presence of Keywords/Phrases ---
    def check_keywords_presence(text, keywords):
        return any(keyword in text for keyword in keywords)

    results['is_selling_course_keyword_present'] = check_keywords_presence(lower_paragraph, selling_course_keywords)
    results['is_emotional_story_keyword_present'] = check_keywords_presence(lower_paragraph, emotional_story_keywords)
    results['is_clickbait_headline_present'] = bool(clickbait_headline_regex.search(paragraph)) # Use original case for regex
    results['is_tag_people_call_to_action'] = check_keywords_presence(lower_paragraph, tag_people_cta_keywords)
    results['is_comment_interested_cta'] = check_keywords_presence(lower_paragraph, comment_interested_cta_keywords)
    results['is_humble_brag_keyword_present'] = check_keywords_presence(lower_paragraph, humble_brag_keywords)
    results['post_length_words'] = num_words
    results['post_length_chars'] = num_chars


    # --- Semantic/Topical Features ---
    results['is_purely_personal_topic'] = check_keywords_presence(lower_paragraph, purely_personal_topic_keywords)
    results['is_generic_advice'] = check_keywords_presence(lower_paragraph, generic_advice_keywords)

    # --- Ratio-based Features ---
    results['exclamation_to_word_ratio'] = results['num_exclamation_marks'] / num_words if num_words > 0 else 0

    return results

# Example usage
example_paragraph = "I just completed a masterclass on emotional intelligence! You won't believe the insights I gained. #EmotionalIntelligence #GrowthMindset"
analysis_results = analyze_linkedin_post(example_paragraph)
print(analysis_results)

{'num_all_caps_words': 0, 'num_exclamation_marks': 1, 'num_question_marks': 0, 'is_selling_course_keyword_present': True, 'is_emotional_story_keyword_present': False, 'is_clickbait_headline_present': True, 'is_tag_people_call_to_action': False, 'is_comment_interested_cta': False, 'is_humble_brag_keyword_present': False, 'post_length_words': 18, 'post_length_chars': 136, 'is_purely_personal_topic': False, 'is_generic_advice': False, 'exclamation_to_word_ratio': 0.05555555555555555}


In [88]:
import tensorflow as tf
keras = tf.keras
models = tf.keras.models
layers = tf.keras.layers
tokenizer_module = tf.keras.preprocessing.text
sequence_module = tf.keras.preprocessing.sequence
Tokenizer = tokenizer_module.Tokenizer
pad_sequences = sequence_module.pad_sequences
import numpy as np

# --- 1. Define Model Hyperparameters (YOU NEED TO SET MAX_SEQUENCE_LENGTH) ---
# This is the maximum number of words/tokens in any input post.
# You need to determine this based on your dataset's average/max post length.
# A good starting point is often the 90th or 95th percentile of your post lengths.
MAX_SEQUENCE_LENGTH = 150 # Example: assuming most posts are <= 150 words
EMBEDDING_DIM = 100       # This comes from your Word2Vec model's vector_size

# Number of classes for binary classification (Good/Bad)
NUM_CLASSES = 1 # For binary classification, output a single probability

# --- 2. Build the CNN Model ---
def build_text_cnn_model(max_seq_length, embedding_dim, num_classes):
    model = models.Sequential()

    # Layer 1: Conv1D (Convolutional Layer for text)
    # filters: Number of feature detectors/filters to learn (e.g., to detect n-gram patterns)
    # kernel_size: The size of the sliding window (e.g., 5 words at a time)
    # activation: ReLU for non-linearity
    # input_shape: (sequence_length, embedding_dimension)
    model.add(layers.Conv1D(filters=128, kernel_size=5, activation='relu',
                            input_shape=(max_seq_length, embedding_dim)))
    # Note: We're starting with 128 filters. You can experiment with this.

    # Layer 2: MaxPooling1D (Pooling Layer for text)
    # pool_size: The size of the pooling window. Reduces sequence length.
    model.add(layers.MaxPooling1D(pool_size=2))
    # This will reduce the length of the sequence by half.

    # Optional: You can add more Conv1D and MaxPooling1D layers for deeper feature extraction
    # model.add(layers.Conv1D(filters=64, kernel_size=3, activation='relu'))
    # model.add(layers.MaxPooling1D(pool_size=2))

    # Layer 3: Flatten Layer
    # Converts the 2D output from MaxPooling1D (length x filters) into a 1D vector
    # This prepares the data for the fully connected (Dense) layers.
    model.add(layers.Flatten())

    # Layer 4: Dense (Fully Connected) Hidden Layer
    # units: Number of neurons in this layer.
    # activation: ReLU for non-linearity.
    model.add(layers.Dense(units=64, activation='relu'))
    # You can experiment with the number of units here.

    # Layer 5: Dense (Output Layer)
    # units: 1 for binary classification (predicting a probability).
    # activation: 'sigmoid' to output a probability between 0 and 1.
    model.add(layers.Dense(units=num_classes, activation='sigmoid'))

    return model

# Create the model instance
model = build_text_cnn_model(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, NUM_CLASSES)

# Print a summary of the model's architecture
model.summary()

# --- 3. Compile the Model ---
# optimizer: How the model updates its weights during training. Adam is a good default.
# loss: Binary Crossentropy for binary classification problems.
# metrics: What to monitor during training (e.g., accuracy).
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [89]:
# --- 4. Prepare Your Data (Conceptual Steps - you'll need to implement this) ---
from sklearn.model_selection import train_test_split

# Assuming you have:
# - `all_posts`: A list of your raw text strings (e.g., ["Just published a new article...", "This course is great...", ...])
# - `all_labels`: A list of corresponding labels (e.g., [0, 1, 0, 1, ...], where 0=Good, 1=Bad)
# - `word2vec_model`: Your trained Word2Vec model

# 1. Tokenize your posts (convert text to lists of words)
# Example (using Keras's Tokenizer for simplicity, but you might use custom tokenization):
tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_data['Post Content'].tolist()) # Fit tokenizer on your post content
word_index = tokenizer.word_index # Vocabulary mapping word to ID

# 2. Convert posts to sequences of word IDs (using your tokenizer)
sequences = tokenizer.texts_to_sequences(final_data['Post Content'].tolist())

# 3. Pad/Truncate sequences to MAX_SEQUENCE_LENGTH
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# 4. Convert padded sequences of word IDs into sequences of Word2Vec vectors
# This is the crucial step to get the input shape (None, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)
# Example:
X = np.zeros((len(padded_sequences), MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
for i, sequence in enumerate(padded_sequences):
    for j, word_id in enumerate(sequence):
        word = tokenizer.index_word.get(word_id) # Convert ID back to word
        if word in word2vec_model.wv: # Check if word is in Word2Vec vocabulary
            X[i, j] = word2vec_model.wv[word]
        else:
            # Handle out-of-vocabulary words (e.g., keep as zeros or use a special UNK vector)
            pass
y = np.array(final_data['Classified'].tolist()) # Convert labels to numpy array

# --- 5. Split Data (Training, Validation) ---
# This code splits your data into training and validation sets, but does not create a separate test set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 6. Train the Model (Conceptual) ---
model.fit(X_train, y_train,
            epochs=10, # Number of training iterations
            batch_size=32, # Number of samples per gradient update
            validation_data=(X_val, y_val)) # Data to evaluate on after each epoch

# --- 7. Evaluate the Model (Conceptual) ---
loss, accuracy = model.evaluate(X_train, y_train) ##fix to x and y test later!!
print(f"Test Accuracy: {accuracy*100:.2f}%")

Epoch 1/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.7482 - loss: 0.5819 - val_accuracy: 0.8250 - val_loss: 0.3871
Epoch 2/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8748 - loss: 0.3189 - val_accuracy: 0.8583 - val_loss: 0.3358
Epoch 3/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9033 - loss: 0.2587 - val_accuracy: 0.8583 - val_loss: 0.3562
Epoch 4/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9019 - loss: 0.2612 - val_accuracy: 0.8708 - val_loss: 0.3685
Epoch 5/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9279 - loss: 0.2087 - val_accuracy: 0.8583 - val_loss: 0.3691
Epoch 6/10
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.9393 - loss: 0.1972 - val_accuracy: 0.8583 - val_loss: 0.4180
Epoch 7/10
[1m30/30[0m [32m━━━━

In [90]:

# --- 8. Make Predictions (Conceptual) ---
new_post_text = ""
for i in range(len(test_data)):
    if test_data['Classified'].iloc[i] == 0:
        new_post_text = test_data['Post Content'].iloc[i]
        break
    
print(new_post_text)

# Preprocess new_post_text into a single Word2Vec sequence (X_new)
sequence = tokenizer.texts_to_sequences([new_post_text])
padded = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_new = np.zeros((1, MAX_SEQUENCE_LENGTH, EMBEDDING_DIM))
for j, word_id in enumerate(padded[0]):
    word = tokenizer.index_word.get(word_id)
    if word in word2vec_model.wv:
        X_new[0, j] = word2vec_model.wv[word]
    # else: leave as zeros

prediction_proba = model.predict(X_new)[0][0]
if prediction_proba > 0.5: # Threshold for "bad"
    print("This post is classified as BAD.")
else:
    print("This post is classified as GOOD.")

automation vs human touch customer experience the ultimate balance real scenarios ive faced  customer service  ai chatbots or human representatives  manufacturing  robotic efficiency or artisanal quality  data analysis  algorithmdriven or humaninterpreted  creative work  aigenerated or humancrafted  decisionmaking  datadriven or experiencebased theres no onesizefitsall solution for the future workplace what ive learned  automation improves efficiency but human touch drives loyalty  the perfect mix hightech hightouch  people crave personalization even in a digital world  continuous learning is the new job security  the best results often come from humanai collaboration the takeaway the future isnt human vs machine its human and machine  repost  to help others hit the  to get notified follow nitin mathur   hashtag  automationvshuman hashtag  customerexperience hashtag  aiethics hashtag  businessinnovation hashtag  linkedinautomation more


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
This post is classified as GOOD.
