In [1]:
import numpy as np # for linear algebra
import pandas as pd # for data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# NOTE: this notebook reads 'Instagram_data.csv' from the current working directory;
# the Kaggle "../input/" directory mentioned by the original template comment is not used here.

In [3]:
# Load the Instagram dataset from a path relative to the notebook's working directory.
# encoding='latin1' makes pandas decode the file's bytes as Latin-1 instead of its
# default UTF-8 -- presumably because the CSV contains non-UTF-8 characters; confirm.
df = pd.read_csv('Instagram_data.csv', encoding='latin1')

FileNotFoundError: [Errno 2] No such file or directory: 'Instagram_data.csv'

In [None]:
# Preview the first rows (DataFrame.head defaults to n=5) to sanity-check the load.
df.head()

In [None]:
# Shorter preview: only the first two rows.
df.head(2)

# Data Exploration:

In [None]:
# List the column labels available for analysis.
df.columns

In [None]:
# Print a concise structural summary of the DataFrame: index range, column names,
# non-null counts, dtypes, and memory usage.
df.info()

In [None]:
# Count missing values per column (isnull is an alias of isna).
df.isnull().sum()

In [None]:
# Number of distinct values in each column (NaN is excluded by default).
df.nunique()

In [None]:
# Summary statistics for the numeric columns, transposed so that each
# feature is a row and each statistic a column.
df.describe().T

# Correlation Analysis:

In [None]:
# Calculate the correlation matrix over the numeric columns only.
# The dataset also holds text columns (e.g. 'Hashtags'); calling DataFrame.corr()
# on them raises ValueError on pandas >= 2.0, so they are filtered out first.
correlation_matrix = df.select_dtypes(include='number').corr()

# Print the correlation matrix
print(correlation_matrix)
print()

# Focus on the correlation between "Impressions" and other variables
# (the trivial self-correlation of 1.0 is dropped).
impressions_correlations = correlation_matrix['Impressions'].drop('Impressions')
print(impressions_correlations)

# Feature Engineering:

In [None]:
# Derive the "Engagement Rate" feature: interactions (likes + comments + shares)
# per impression.
interactions = df['Likes'] + df['Comments'] + df['Shares']

# A post with zero impressions has no defined rate. Masking the zero denominator
# yields NaN for such rows instead of +/-inf, which would otherwise distort
# later means and correlations.
impressions_nonzero = df['Impressions'].where(df['Impressions'] != 0)
df['Engagement Rate'] = interactions / impressions_nonzero

# Show the updated DataFrame (rich display rather than plain-text print).
df.head()

In [None]:
# First ten rows, transposed so each column of the dataset is shown as a row.
df.head(10).T

In [None]:
# @title Impressions vs From Home

# Scatter of total impressions against impressions coming from the home feed.
# s is the marker area (points squared); alpha=.8 draws markers 80% opaque.
ax = df.plot(kind='scatter', x='Impressions', y='From Home', s=32, alpha=.8)
# Hide the top and right frame lines for a cleaner look.
ax.spines[['top', 'right']].set_visible(False)

# Exploratory Data Analysis:

In [None]:
# Re-list the column labels (now including any engineered features) before plotting.
df.columns

In [None]:
# Analyze trends and patterns in features
# Example 1: Impressions plotted against the row index of the DataFrame
# NOTE(review): the x-axis is the post index, used here as a stand-in for time --
# confirm that rows are in chronological order.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=df.index, y=df['Impressions'], ax=ax)
ax.set(
    xlabel='Post Index',
    ylabel='Reach (Impressions)',
    title='Trend of Reach over Time',
)
plt.show()

In [None]:
# Example 2: Bar chart of the mean impressions contributed by each source
sources = ['From Home', 'From Hashtags', 'From Explore', 'From Other']
mean_by_source = [df[source].mean() for source in sources]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=sources, y=mean_by_source, ax=ax)
ax.set_xlabel('Source')
ax.set_ylabel('Reach')
ax.set_title('Reach from Different Sources')
plt.show()

In [None]:
# Analyze correlations between features
# Only numeric columns are correlated: the DataFrame also contains text columns
# (e.g. 'Hashtags'), and DataFrame.corr() raises ValueError on them in pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Example 3: Box plot of Impressions grouped by each distinct 'Profile Visits' value.
# NOTE(review): this cell was labelled "Reach by Post Type", but the x variable is
# 'Profile Visits' and no post-type, posting-time, or content-theme column is used
# here -- the title below now matches what is actually plotted.
fig, ax = plt.subplots(figsize=(22, 6))
sns.boxplot(x=df['Profile Visits'], y=df['Impressions'], ax=ax)
ax.set_xlabel('Profile Visits')
ax.set_ylabel('Reach (Impressions)')
ax.set_title('Reach by Profile Visits')
plt.show()

In [None]:
# Example 4: Word cloud of the most used hashtags
from wordcloud import WordCloud

# Concatenate every non-missing 'Hashtags' entry into one space-separated string.
hashtags = ' '.join(df['Hashtags'].dropna())

cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(hashtags)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Used Hashtags')
plt.show()

In [None]:
# Inspect the row index of the DataFrame.
df.index

In [None]:
# Column labels, for reference when choosing model features below.
df.columns

# Predictive Modeling:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Reuse the DataFrame loaded at the top of the notebook. Re-reading the CSV here
# would rebind `df` to the raw file and silently discard columns engineered in
# earlier cells (e.g. 'Engagement Rate') before the final export.

# Predictor columns and prediction target.
# NOTE(review): the four 'From *' columns look like a per-source breakdown of
# 'Impressions'; if they sum to the target, the fit is close to an identity and
# the R-squared below says little about real predictive power -- confirm.
feature_columns = [
    'From Home', 'From Hashtags', 'From Explore', 'From Other',
    'Saves', 'Comments', 'Shares', 'Likes', 'Profile Visits', 'Follows',
]
X = df[feature_columns]
y = df['Impressions']

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Select and train the machine learning model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance on the held-out test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Use the trained model to predict the reach for a new (hypothetical) post
new_data = pd.DataFrame({
    'From Home': [10],
    'From Hashtags': [5],
    'From Explore': [2],
    'From Other': [3],
    'Saves': [50],
    'Comments': [20],
    'Shares': [10],
    'Likes': [100],
    'Profile Visits': [30],
    'Follows': [5]
})

# Select columns in the training order so the prediction does not depend on the
# key order of the dict above.
predicted_reach = model.predict(new_data[feature_columns])
print("Predicted Reach:", predicted_reach)


# Interpretation and Recommendations:

In [None]:

from sklearn.linear_model import LinearRegression

# Reuse the DataFrame already in memory. Re-reading the CSV here would rebind `df`
# to the raw file and drop any columns engineered earlier in the notebook
# (e.g. 'Engagement Rate') before the final export.

# Select the relevant features for prediction
X = df[['From Home', 'From Hashtags', 'From Explore', 'From Other', 'Saves', 'Comments', 'Shares', 'Likes', 'Profile Visits', 'Follows']]
y = df['Impressions']

# Train the machine learning model on the full dataset (no hold-out: this fit is
# used for interpretation, not for evaluation)
model = LinearRegression()
model.fit(X, y)

# Interpret the coefficients or feature importances
coefficients = model.coef_
# these coefficients represent the weights assigned to each feature in the linear equation

print(coefficients)
print("\n")

# Each feature's share of the total absolute coefficient mass.
# NOTE(review): coefficients are per raw unit of each feature, so these shares are
# only comparable across features that are on similar scales -- confirm, or
# standardise the features first.
absolute_coefficients = abs(coefficients)
feature_importances = absolute_coefficients / absolute_coefficients.sum()

# Identify the key factors that significantly influence reach
key_factors = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': coefficients,
    'Feature Importance': feature_importances
})
# Largest positive coefficients first; negative coefficients end up at the bottom.
key_factors = key_factors.sort_values(by='Coefficient', ascending=False)

# Print the key factors and their impact on reach
print(key_factors)

# Recommendations to optimize reach

# 1. Optimize the use of hashtags:
# Based on the impact on reach from hashtags, analyze the hashtags used in successful posts.
# Identify popular and relevant hashtags in the niche and incorporate them strategically.

# 2. Encourage engagement:
# Focus on generating saves, comments, and shares as they positively influence reach.
# Create compelling and shareable content that encourages interaction and discussion among the audience.

# 3. Analyze post types and content themes:
# Explore the impact of different post types (photos, videos) on reach.
# Analyze the performance of various content themes or topics to identify what resonates best with the audience.

# 4. Understand reach from different sources:
# Analyze the reach from different sources such as home, explore, and others.
# Determine the distribution of audience engagement and consider adjusting strategies to target specific sources more effectively.


In [None]:
# Export the analysed DataFrame to CSV in the working directory, keeping the row
# index as the first column. The file name must not start with a space: a leading
# space produces a file that is easy to miss and awkward to reference from shells
# and other tools.
df.to_csv('Instagram_Reach_Analysis_data.csv', index=True)