In [21]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [22]:
# Read the posts/users table from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='collaborative.csv')

In [23]:
# Show the column names of the loaded DataFrame
df.columns

Index(['post_id', 'user_id', 'user_name_x', 'caption', 'image_url',
       'timestamp', 'interaction', 'hashtags', 'location_x', 'post_type',
       'user_name_y', 'follower_count', 'following_count', 'location_y',
       'age_group', 'gender', 'activity_level'],
      dtype='object')

In [24]:
# Build the model input text by joining each post's caption and hashtags.
# Missing values are replaced with '' first: `str + NaN` evaluates to NaN, and
# TfidfVectorizer rejects NaN documents when the text is vectorized later.
df['text'] = df['caption'].fillna('') + ' ' + df['hashtags'].fillna('')

In [25]:
# Model input: the combined caption + hashtag text of each post
X = df['text']

# Targets: the caption string and the hashtag string of each post, each used
# as a class label by the classifiers trained below.
# NOTE(review): X is built from these same two columns, so every input already
# contains its own label text -- expect the evaluation scores to be optimistic.
y_caption, y_hashtags = df['caption'], df['hashtags']

In [26]:
# Split once so the caption and hashtag targets share exactly the same
# train/test rows. train_test_split accepts several arrays and applies a single
# shuffle to all of them, so alignment no longer depends on two separate calls
# happening to use the same size and seed.
(X_train_caption, X_test_caption,
 y_train_caption, y_test_caption,
 y_train_hashtag, y_test_hashtag) = train_test_split(
    X, y_caption, y_hashtags, test_size=0.2, random_state=42)

# Both tasks use the same feature text, so the hashtag-task names refer to the
# same split as the caption-task names.
X_train_hashtag, X_test_hashtag = X_train_caption, X_test_caption

In [27]:
# TF-IDF vectorizer used for the text features; max_features caps the
# vocabulary at 5000 terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)

In [28]:
# Learn the TF-IDF vocabulary and weights from the training text only, then
# encode the training and test splits with that fitted vectorizer
tfidf_vectorizer.fit(X_train_caption)
X_train_caption_tfidf = tfidf_vectorizer.transform(X_train_caption)
X_test_caption_tfidf = tfidf_vectorizer.transform(X_test_caption)

In [29]:
# Encode the hashtag-task text with the vectorizer that was already fitted on
# the training text. Only transform() is used here: calling fit_transform()
# again would re-learn the vocabulary and IDF weights on this shared
# vectorizer, and the caption matrices and model would stay consistent with it
# only if the refit happened to reproduce the same vocabulary.
# X_train_hashtag holds the same rows as X_train_caption (same X, test_size and
# random_state in the split), so the fitted state matches this data.
X_train_hashtag_tfidf = tfidf_vectorizer.transform(X_train_hashtag)
X_test_hashtag_tfidf = tfidf_vectorizer.transform(X_test_hashtag)

In [30]:
# Caption classifier: a random forest of 100 trees with a fixed seed,
# trained on the TF-IDF matrix of the training text
caption_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
caption_model.fit(X=X_train_caption_tfidf, y=y_train_caption)

In [31]:
# Hashtag classifier: same random-forest configuration as the caption model,
# trained against the hashtag labels
hashtag_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
hashtag_model.fit(X=X_train_hashtag_tfidf, y=y_train_hashtag)

In [32]:
# Run each trained classifier on its held-out TF-IDF matrix
y_pred_caption, y_pred_hashtag = (
    caption_model.predict(X_test_caption_tfidf),
    hashtag_model.predict(X_test_hashtag_tfidf),
)

In [33]:
# Per-class precision / recall / F1 of the caption classifier on the test split
print(
    "Caption Prediction Report:",
    classification_report(y_true=y_test_caption, y_pred=y_pred_caption),
    sep="\n",
)

Caption Prediction Report:
                                            precision    recall  f1-score   support

                      A sad day for us all       1.00      1.00      1.00         6
                                 Beach day       1.00      1.00      1.00         2
                        Chasing waterfalls       1.00      1.00      1.00         4
                               City lights       1.00      1.00      1.00         1
                               Coffee time       1.00      1.00      1.00         2
                       Dinner with friends       1.00      1.00      1.00         3
                       Enjoying the sunset       1.00      1.00      1.00         2
                   Exploring the mountains       1.00      1.00      1.00         3
                               Family time       1.00      1.00      1.00         1
                           Feeling blessed       1.00      1.00      1.00         6
                         Healthy breakfast      

In [34]:
# Per-class precision / recall / F1 of the hashtag classifier on the test split
print(
    "Hashtag Prediction Report:",
    classification_report(y_true=y_test_hashtag, y_pred=y_pred_hashtag),
    sep="\n",
)

Hashtag Prediction Report:
                                   precision    recall  f1-score   support

             #anxious #sad #faces       1.00      1.00      1.00         3
                #beach #sun #sand       1.00      1.00      1.00         2
     #blessed #grateful #thankful       1.00      1.00      1.00         6
        #breakfast #healthy #food       1.00      1.00      1.00         3
          #city #nightlife #urban       1.00      1.00      1.00         1
       #coffee #morning #caffeine       1.00      1.00      1.00         2
      #depression #stress #trauma       1.00      1.00      1.00         6
      #dinner #friends #goodtimes       1.00      1.00      1.00         3
       #family #love #qualitytime       1.00      1.00      1.00         1
                #happy #joy #feel       1.00      1.00      1.00         7
#healthyliving #fitness #wellness       1.00      1.00      1.00         4
     #hiking #outdoors #adventure       1.00      1.00      1.00        

In [35]:
# Fraction of test posts whose predicted label exactly matches the true label
caption_accuracy, hashtag_accuracy = (
    accuracy_score(y_true=y_test_caption, y_pred=y_pred_caption),
    accuracy_score(y_true=y_test_hashtag, y_pred=y_pred_hashtag),
)

In [36]:
# Report both accuracies as percentages with two decimals
print(
    f"Caption Prediction Accuracy: {caption_accuracy * 100:.2f}%",
    f"Hashtag Prediction Accuracy: {hashtag_accuracy * 100:.2f}%",
    sep="\n",
)

Caption Prediction Accuracy: 100.00%
Hashtag Prediction Accuracy: 100.00%


In [47]:
# Persist the trained classifiers and the fitted vectorizer to disk.
# (joblib is imported with the other imports at the top of the notebook.)
joblib.dump(caption_model, 'caption.pkl')
joblib.dump(hashtag_model, 'hashtag.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Reload the saved artifacts so the prediction below exercises the files on
# disk rather than the in-memory objects.
# NOTE: joblib files are pickle-based and can execute arbitrary code when
# loaded -- only load files that come from a trusted source.
caption_model = joblib.load('caption.pkl')
hashtag_model = joblib.load('hashtag.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example free-text input to run through the reloaded models
user_input = "Hello I am Happy # #caption"

# transform() expects an iterable of documents, hence the one-element list.
# The sparse result is passed to predict() as-is; converting it to a dense
# array first is not needed.
user_input_tfidf = tfidf_vectorizer.transform([user_input])

predicted_caption = caption_model.predict(user_input_tfidf)
predicted_hashtag = hashtag_model.predict(user_input_tfidf)

# Each prediction is an array holding one label per input document
print(predicted_caption)
print(predicted_hashtag)

['I am feeling very happy']
['#happy #joy #feel']
