In [None]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob

# Step 1: Load the CSV file
df = pd.read_csv('D:\Github\SocialSentinel\data\d4\instagram_reach.csv')

# Step 2: Data Preprocessing
# Keep only relevant columns (Caption, Hashtags)
df = df[['Caption', 'Hashtags']]

# Remove any missing or NaN values
df.dropna(subset=['Caption', 'Hashtags'], inplace=True)

# Step 3: Sentiment Analysis Function
def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity  # Returns a score between -1 and 1

# Step 4: Calculate Sentiment Scores for Caption and Hashtags
df['caption_score'] = df['Caption'].apply(analyze_sentiment)
df['hashtag_score'] = df['Hashtags'].apply(analyze_sentiment)

# Step 5: Combine Scores to Determine Overall Sentiment
df['overall_score'] = (df['caption_score'] + df['hashtag_score']) / 2

# Step 6: Categorize Sentiment
def categorize_sentiment(score):
    if score < 0:
        return 'Negative'
    elif score > 0:
        return 'Positive'
    else:
        return 'Neutral'

df['sentiment'] = df['overall_score'].apply(categorize_sentiment)

# Step 7: Create a new DataFrame with required columns
final_df = df[['Caption', 'Hashtags', 'sentiment', 'overall_score']]

# Step 8: Save to a new CSV file
final_df.to_csv('instagram_reach_with_sentiments.csv', index=False)

print("Sentiment analysis complete! New file saved as 'instagram_reach_with_sentiments.csv'.")

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Step 1: Load the new CSV file
df = pd.read_csv('instagram_reach_with_sentiments.csv')

# Step 2: Prepare Data for Machine Learning
X = df['Caption'] + " " + df['Hashtags']  # Features (text data)
y = df['sentiment']  # Labels (sentiment categories)

# Step 3: Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 4: Convert Text Data into Numerical Features
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Step 5: Use Random Over Sampling to Balance Classes
ros = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = ros.fit_resample(X_train_tfidf, y_train)

print("Original training set shape:", Counter(y_train))
print("Balanced training set shape:", Counter(y_train_balanced))

# Step 6: Train a Machine Learning Model (Logistic Regression)
model = LogisticRegression(class_weight='balanced')  # Handle class imbalance by adjusting weights
model.fit(X_train_balanced, y_train_balanced)

# Step 7: Evaluate the Model
y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 8: Make Predictions on New Data
def predict_sentiment(new_caption, new_hashtags):
    new_text = new_caption + " " + new_hashtags
    new_tfidf = tfidf.transform([new_text])
    return model.predict(new_tfidf)[0]


# Example usage
new_caption = "hello"
new_hashtags = "#programming"
predicted_sentiment = predict_sentiment(new_caption, new_hashtags)
print(f"Predicted Sentiment: {predicted_sentiment}")

new_caption = "The Internet of Things : A Very Short Story."
new_hashtags = "#MachineLearning"
predicted_sentiment = predict_sentiment(new_caption, new_hashtags)
print(f"Predicted Sentiment: {predicted_sentiment}")

Original training set shape: Counter({'Positive': 45, 'Neutral': 25, 'Negative': 5})
Balanced training set shape: Counter({'Positive': 45, 'Neutral': 45, 'Negative': 45})
Classification Report:
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00         2
     Neutral       1.00      0.33      0.50         6
    Positive       0.65      1.00      0.79        11

    accuracy                           0.68        19
   macro avg       0.55      0.44      0.43        19
weighted avg       0.69      0.68      0.61        19

Predicted Sentiment: Positive
Predicted Sentiment: Positive


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
new_caption = "i AM very sad"
new_hashtags = "sad"
predicted_sentiment = predict_sentiment(new_caption, new_hashtags)
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: Positive


In [None]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Using Random Forest for better accuracy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Step 1: Load the CSV file
df = pd.read_csv('D:\Github\SocialSentinel\test\sentiments.csv')

# Step 2: Data Preprocessing
# Keep only relevant columns (Caption, Hashtags)
df = df[['Caption', 'Hashtags']]

# Remove any missing or NaN values
df.dropna(subset=['Caption', 'Hashtags'], inplace=True)

# Step 3: Sentiment Analysis Function
def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity 


# Step 4: Calculate Sentiment Scores for Caption and Hashtags
df['caption_score'] = df['Caption'].apply(analyze_sentiment)
df['hashtag_score'] = df['Hashtags'].apply(analyze_sentiment)

# Step 5: Combine Scores to Determine Overall Sentiment
df['overall_score'] = (df['caption_score'] + df['hashtag_score']) / 2

# Step 6: Categorize Sentiment

def categorize_sentiment(score):
    if score < 0:
        return 'Negative'
    elif score > 0:
        return 'Positive'
    else:
        return 'Neutral'


df['sentiment'] = df['overall_score'].apply(categorize_sentiment)

# Step 7: Create a new DataFrame with required columns
final_df = df[['Caption', 'Hashtags', 'sentiment', 'overall_score']]

# Step 8: Save to a new CSV file
final_df.to_csv('instagram_reach_with_sentiments.csv', index=False)

print("Sentiment analysis complete! New file saved as 'instagram_reach_with_sentiments.csv'.")

# Load the new CSV file for training the model
df = pd.read_csv('instagram_reach_with_sentiments.csv')

# Prepare Data for Machine Learning
X = df['Caption'] + " " + df['Hashtags']  # Features (text data)
y = df['sentiment']  # Labels (sentiment categories)

# Split the Data into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Convert Text Data into Numerical Features
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Use Random Over Sampling to Balance Classes
ros = RandomOverSampler(random_state=42)
X_train_balanced, y_train_balanced = ros.fit_resample(X_train_tfidf, y_train)

print("Original training set shape:", Counter(y_train))
print("Balanced training set shape:", Counter(y_train_balanced))

# Train a Machine Learning Model (Random Forest Classifier)
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_balanced, y_train_balanced)

# Save the model and the TF-IDF vectorizer
with open('sentiment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Evaluate the Model
y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Make Predictions on New Data
def predict_sentiment(new_caption, new_hashtags):
    # Load the model and vectorizer
    with open('sentiment_model.pkl', 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
        loaded_vectorizer = pickle.load(vectorizer_file)

    new_text = new_caption + " " + new_hashtags
    new_tfidf = loaded_vectorizer.transform([new_text])
    return loaded_model.predict(new_tfidf)[0]

In [2]:
new_caption = "I am happy"
new_hashtags = "#MachineLearning"
predicted_sentiment = predict_sentiment(new_caption, new_hashtags)
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: Neutral


In [10]:
import pickle
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the Random Forest model
with open('sentiment_model.pkl', 'rb') as random_forest_model_file:
    random_forest_model = pickle.load(random_forest_model_file)

# Load the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# Define the test data
test_data = pd.DataFrame(
    {'text': ['I love this product', 'This product is terrible']})

# Preprocess the text data


def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text


# Apply preprocessing
test_data['text'] = test_data['text'].apply(preprocess_text)

# Vectorize the preprocessed text data
X_test = tfidf_vectorizer.transform(test_data['text'])

# Make predictions
predictions = random_forest_model.predict(X_test)

# Print the results
for text, prediction in zip(test_data['text'], predictions):
    print(f"Text: '{text}' - Sentiment: {prediction}")

FileNotFoundError: [Errno 2] No such file or directory: 'tfidf_vectorizer.pkl'