In [1]:
#import packages

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

In [3]:
# Read the Instagram post metrics from CSV and preview the first ten rows.

# NOTE(review): absolute path (looks like a Colab working directory) -- adjust
# it when running the notebook elsewhere.
csv_path = "/content/Instagram data.csv"

# latin-1 is requested explicitly; presumably the file is not valid UTF-8
# (TODO confirm against the raw file).
data = pd.read_csv(csv_path, encoding='latin-1')

preview = data.head(10)
print(preview)

   Impressions  From Home  From Hashtags  From Explore  From Other  Saves  \
0         3920       2586           1028           619          56     98   
1         5394       2727           1838          1174          78    194   
2         4021       2085           1188             0         533     41   
3         4528       2700            621           932          73    172   
4         2518       1704            255           279          37     96   
5         3884       2046           1214           329          43     74   
6         2621       1543            599           333          25     22   
7         3541       2071            628           500          60    135   
8         3749       2384            857           248          49    155   
9         4115       2609           1104           178          46    122   

   Comments  Shares  Likes  Profile Visits  Follows  \
0         9       5    162              35        2   
1         7      14    224              48

In [5]:
# List the column labels of the loaded frame.

column_labels = data.columns
print(column_labels)

In [6]:
# Summarise the dataset: number of rows, per-column non-null counts and dtypes.

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line after the report.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120 entries, 0 to 119
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Impressions     120 non-null    int64 
 1   From Home       120 non-null    int64 
 2   From Hashtags   120 non-null    int64 
 3   From Explore    120 non-null    int64 
 4   From Other      120 non-null    int64 
 5   Saves           120 non-null    int64 
 6   Comments        120 non-null    int64 
 7   Shares          120 non-null    int64 
 8   Likes           120 non-null    int64 
 9   Profile Visits  120 non-null    int64 
 10  Follows         120 non-null    int64 
 11  Caption         120 non-null    object
 12  Hashtags        120 non-null    object
dtypes: int64(11), object(2)
memory usage: 12.3+ KB
None


In [7]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.

summary_stats = data.describe()
print(summary_stats)

        Impressions    From Home  From Hashtags  From Explore   From Other  \
count    120.000000    120.00000     120.000000    120.000000   120.000000   
mean    5914.958333   2472.70000    1896.800000   1167.158333   182.983333   
std     5348.446502   1483.50142    1879.183314   2778.898461   316.280855   
min     1941.000000   1133.00000     116.000000      0.000000     9.000000   
25%     3473.500000   1945.00000     735.500000    157.750000    38.500000   
50%     4293.500000   2201.00000    1278.000000    327.500000    74.500000   
75%     6168.000000   2599.75000    2389.250000    711.500000   205.000000   
max    36919.000000  13473.00000   11817.000000  17414.000000  2547.000000   

             Saves    Comments      Shares       Likes  Profile Visits  \
count   120.000000  120.000000  120.000000  120.000000      120.000000   
mean    152.533333    6.650000    9.458333  176.500000       51.266667   
std     155.892439    3.532918   10.102746   87.270178       87.008908   
m

In [8]:
# Count the missing values in every column.

# isna() is the same check as isnull(); summing the boolean mask column-wise
# gives the number of missing entries per column.
missing_per_column = data.isna().sum()
print(missing_per_column)

Impressions       0
From Home         0
From Hashtags     0
From Explore      0
From Other        0
Saves             0
Comments          0
Shares            0
Likes             0
Profile Visits    0
Follows           0
Caption           0
Hashtags          0
dtype: int64


In [9]:
# Start the exploration with the distribution of shares per post.

shares_hist_options = {
    'x': 'Shares',
    'nbins': 10,
    'title': 'Distribution of Shares',
}
fig = px.histogram(data, **shares_hist_options)
fig.show()

In [10]:
# Plot the impressions of each post, in row order.

# The frame's index supplies the x-axis. NOTE(review): whether row order is
# chronological cannot be told from the frame itself -- confirm before reading
# this chart as a time series.
impressions_line_options = dict(
    x=data.index,
    y='Impressions',
    title='Impressions Over Time',
)
fig = px.line(data, **impressions_line_options)
fig.show()

In [11]:
# Compare likes, saves and follows per post on a single chart.

fig = go.Figure()

# One line per metric; looping keeps the three traces configured identically.
for metric in ('Likes', 'Saves', 'Follows'):
    fig.add_trace(go.Scatter(x=data.index, y=data[metric], name=metric))

# The x values are the frame's integer row index (the dataset has no date
# column), so the axis is labelled as a post index rather than 'Date'.
fig.update_layout(title='Metrics Over Time',
                  xaxis_title='Post Index',
                  yaxis_title='Count')

fig.show()

In [12]:
# Share of total reach contributed by each traffic source.

reach_sources = ['From Home', 'From Hashtags', 'From Explore', 'From Other']
# Column totals over all posts, in the same order as reach_sources.
reach_counts = [data[source].sum() for source in reach_sources]

colors = ['#FDFFB6', '#FAD1FA', '#D9EDF8', '#DEDAF4']

# names/values are standalone lists with one entry per source, so no
# data_frame is passed: the pie does not reference any per-post column.
fig = px.pie(names=reach_sources,
             values=reach_counts,
             title='Reach from Multiple Sources',
             color_discrete_sequence=colors)
fig.show()

In [13]:
# Breakdown of total user engagement by interaction type.

engagement_metrics = ['Saves', 'Comments', 'Shares', 'Likes']
# Column totals over all posts, in the same order as engagement_metrics.
engagement_counts = [data[metric].sum() for metric in engagement_metrics]

colors = ['#FD8A8A', '#E9A1D4', '#F1F7B5', '#A8D1D1']

# names/values are standalone lists with one entry per metric, so no
# data_frame is passed: the pie does not reference any per-post column.
fig = px.pie(names=engagement_metrics,
             values=engagement_counts,
             title='Users Engagement Sources',
             color_discrete_sequence=colors)
fig.show()

In [14]:
# Relationship between likes and follows, with a fitted trend line.

# trendline='ols' requests an ordinary-least-squares fit from Plotly Express
# (per Plotly's documentation this relies on the statsmodels package).
likes_follows_options = dict(
    x='Likes',
    y='Follows',
    trendline='ols',
    title='Likes vs. Follows',
)
fig = px.scatter(data, **likes_follows_options)
fig.show()

In [16]:
# Word cloud of the hashtags used across all posts.

# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from wordcloud import WordCloud

# Concatenate every post's hashtag string into one space-separated text.
hashtags = ' '.join(data['Hashtags'].astype(str))

# Word placement and colouring are randomised; a fixed random_state makes the
# rendered image identical on every run.
wordcloud = WordCloud(random_state=42).generate(hashtags)

# Hand Plotly an explicit pixel array rather than relying on implicit
# conversion of the WordCloud object.
fig = px.imshow(wordcloud.to_array(), title='Hashtags Word Cloud')
fig.show()

In [20]:
# Pairwise correlation between the numeric features, drawn as a heatmap.

# Only numeric columns take part in the correlation calculation.
numerical_data = data.select_dtypes(include=['number'])
corr_matrix = numerical_data.corr()

# zmin/zmax pin the colour scale to the full correlation range [-1, 1].
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
)
fig = go.Figure(data=heatmap_trace)

fig.update_layout(
    title='Correlation Matrix',
    xaxis_title='Features',
    yaxis_title='Features',
)

fig.show()

In [21]:
# Frequency of each hashtag across all posts.

# Flatten every post's whitespace-separated hashtag string into one list.
# split() with no arguments already discards surrounding whitespace, so the
# resulting tokens need no further stripping.
all_hashtags = [
    tag
    for post_tags in data['Hashtags']
    for tag in str(post_tags).split()
]

# Tally the occurrences and shape the result into a two-column frame.
hashtag_distribution = (
    pd.Series(all_hashtags)
    .value_counts()
    .rename_axis('Hashtag')
    .reset_index(name='Count')
)

fig = px.bar(hashtag_distribution, x='Hashtag',
             y='Count', title='Distribution of Hashtags')
fig.show()

In [22]:
# Total likes and impressions accumulated by every hashtag.
hashtag_likes = {}
hashtag_impressions = {}

# Walk the three relevant columns in lockstep; a post's likes and impressions
# are credited to each hashtag found in its whitespace-separated tag string.
post_columns = zip(data['Hashtags'], data['Likes'], data['Impressions'])
for post_tags, post_likes, post_impressions in post_columns:
    for tag in str(post_tags).split():
        hashtag_likes[tag] = hashtag_likes.get(tag, 0) + post_likes
        hashtag_impressions[tag] = hashtag_impressions.get(tag, 0) + post_impressions

# One frame per metric, with hashtags in order of first appearance.
likes_distribution = pd.DataFrame({
    'Hashtag': list(hashtag_likes.keys()),
    'Likes': list(hashtag_likes.values()),
})
impressions_distribution = pd.DataFrame({
    'Hashtag': list(hashtag_impressions.keys()),
    'Impressions': list(hashtag_impressions.values()),
})

fig_likes = px.bar(
    likes_distribution,
    x='Hashtag',
    y='Likes',
    title='Likes Distribution for Each Hashtag',
)
fig_impressions = px.bar(
    impressions_distribution,
    x='Hashtag',
    y='Impressions',
    title='Impressions Distribution for Each Hashtag',
)

# Likes chart first, then impressions.
fig_likes.show()
fig_impressions.show()