In [4]:
import pandas as pd
from collections import Counter
import re
import plotly.express as px
import plotly.io as pio
# Render every Plotly figure in this notebook with the plain white theme.
pio.templates.default = "plotly_white"

# Read the search-query export into a DataFrame and preview its first five rows.
# NOTE(review): the absolute path looks environment-specific — confirm it
# exists wherever this notebook is re-run.
queries_df = pd.read_csv(filepath_or_buffer="/content/search_query_anomaly.csv")
print(queries_df.head(5))

                                 Top queries  Clicks  Impressions     CTR  \
0                number guessing game python    5223        14578  35.83%   
1                        thecleverprogrammer    2809         3456  81.28%   
2           python projects with source code    2077        73380   2.83%   
3  classification report in machine learning    2012         4959  40.57%   
4                      the clever programmer    1931         2528  76.38%   

   Position  
0      1.61  
1      1.02  
2      5.94  
3      1.28  
4      1.09  


In [5]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after the report.
queries_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Top queries  1000 non-null   object 
 1   Clicks       1000 non-null   int64  
 2   Impressions  1000 non-null   int64  
 3   CTR          1000 non-null   object 
 4   Position     1000 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 39.2+ KB
None


In [6]:
# Cleaning CTR column: turn percentage strings such as "35.83%" into
# fractions (0.3583).
# The dtype guard keeps this cell idempotent: once CTR is numeric, running the
# cell again would otherwise fail because `.str` only works on string data.
if not pd.api.types.is_numeric_dtype(queries_df['CTR']):
    queries_df['CTR'] = queries_df['CTR'].str.rstrip('%').astype('float') / 100

In [7]:
# Tokenizer used for the word-frequency analysis of the queries
def clean_and_split(query):
    """Return the alphabetic words of ``query``, lower-cased, in order of appearance.

    A word is a run of ASCII letters delimited by word boundaries on both
    sides, so tokens that touch a digit or underscore (e.g. "r2") are
    skipped entirely rather than truncated.

    Args:
        query: The search query string to tokenize.

    Returns:
        A list of lower-case words; empty when nothing matches.
    """
    lowered = query.lower()
    return [match.group(0) for match in re.finditer(r'\b[a-zA-Z]+\b', lowered)]

# Tally every word across all queries in a single pass: each query is
# tokenized with clean_and_split and its words are fed straight to the Counter.
word_counts = Counter(
    word
    for query in queries_df['Top queries']
    for word in clean_and_split(query)
)

# Keep only the 20 most frequent words as a two-column frame for plotting.
word_freq_df = pd.DataFrame(
    word_counts.most_common(20),
    columns=['Word', 'Frequency'],
)

# Bar chart of the word frequencies
fig = px.bar(
    word_freq_df,
    x='Word',
    y='Frequency',
    title='Top 20 Most Common Words in Search Queries',
)
fig.show()

In [8]:
# Ten highest-ranking queries for each volume metric. The frame is narrowed
# to the two columns needed before ranking; the selected rows are unchanged.
top_queries_clicks_vis = queries_df[['Top queries', 'Clicks']].nlargest(10, 'Clicks')
top_queries_impressions_vis = queries_df[['Top queries', 'Impressions']].nlargest(10, 'Impressions')

# One bar chart per metric: clicks first, then impressions.
fig_clicks = px.bar(
    top_queries_clicks_vis,
    x='Top queries',
    y='Clicks',
    title='Top Queries by Clicks',
)
fig_impressions = px.bar(
    top_queries_impressions_vis,
    x='Top queries',
    y='Impressions',
    title='Top Queries by Impressions',
)
fig_clicks.show()
fig_impressions.show()

In [9]:
# The ten queries at each end of the click-through-rate range.
# Relies on CTR already being numeric (converted from "xx.xx%" strings earlier).
top_ctr_vis = queries_df[['Top queries', 'CTR']].nlargest(10, 'CTR')
bottom_ctr_vis = queries_df[['Top queries', 'CTR']].nsmallest(10, 'CTR')

# One bar chart per extreme: best CTR first, then worst.
fig_top_ctr = px.bar(
    top_ctr_vis,
    x='Top queries',
    y='CTR',
    title='Top Queries by CTR',
)
fig_bottom_ctr = px.bar(
    bottom_ctr_vis,
    x='Top queries',
    y='CTR',
    title='Bottom Queries by CTR',
)
fig_top_ctr.show()
fig_bottom_ctr.show()

In [10]:
# Pairwise Pearson correlation between the four numeric metrics, drawn as a
# heatmap with the coefficient written inside each cell.
correlation_matrix = (
    queries_df
    .loc[:, ['Clicks', 'Impressions', 'CTR', 'Position']]
    .corr(method='pearson')
)
fig_corr = px.imshow(correlation_matrix, title='Correlation Matrix', text_auto=True)
fig_corr.show()

In [11]:
from sklearn.ensemble import IsolationForest

# Selecting relevant features
features = queries_df[['Clicks', 'Impressions', 'CTR', 'Position']]

# Initializing Isolation Forest.
# contamination is the expected proportion of outliers (1% of the rows here).
# random_state pins the otherwise random tree construction so the same
# queries are flagged on every Restart & Run All; 42 matches the seed the
# K-Means cell of this notebook uses.
iso_forest = IsolationForest(n_estimators=100, contamination=0.01, random_state=42)

# Fitting the model
iso_forest.fit(features)

# Predicting anomalies: predict() labels inliers as 1 and outliers as -1
queries_df['anomaly'] = iso_forest.predict(features)

# Keeping only the rows flagged as anomalies
anomalies = queries_df[queries_df['anomaly'] == -1]

In [12]:
# Show the flagged queries with their metrics only (helper columns left out).
print(anomalies.loc[:, ['Top queries', 'Clicks', 'Impressions', 'CTR', 'Position']])

                          Top queries  Clicks  Impressions     CTR  Position
0         number guessing game python    5223        14578  0.3583      1.61
1                 thecleverprogrammer    2809         3456  0.8128      1.02
2    python projects with source code    2077        73380  0.0283      5.94
4               the clever programmer    1931         2528  0.7638      1.09
7         python turtle graphics code    1455        13585  0.1071      4.60
15         rock paper scissors python    1111        35824  0.0310      7.19
21              classification report     933        39896  0.0234      7.53
34           machine learning roadmap     708        42715  0.0166      8.97
82                           r2 score     367        56322  0.0065      9.33
929                     python turtle      52        18228  0.0029     18.75


In [14]:
from sklearn.cluster import KMeans

# Feature matrix for clustering: the four numeric metrics of each query.
features = queries_df.loc[:, ['Clicks', 'Impressions', 'CTR', 'Position']]

# Build a 3-cluster K-Means model with a fixed seed and fit it in one step
# (fit() returns the fitted estimator itself).
# NOTE(review): the features live on very different scales (Impressions in the
# tens of thousands, CTR below 1) and are not standardized before clustering —
# confirm this is intended.
kmeans = KMeans(n_clusters=3, random_state=42).fit(features)

# Store each query's cluster id alongside its metrics.
queries_df['cluster'] = kmeans.labels_


In [15]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the four
# metrics, computed separately for every K-Means cluster.
cluster_summary = (
    queries_df
    .groupby(by='cluster')[['Clicks', 'Impressions', 'CTR', 'Position']]
    .describe()
)
print(cluster_summary)

        Clicks                                                                \
         count        mean         std    min    25%    50%      75%     max   
cluster                                                                        
0        961.0  155.617066  210.960445   48.0   64.0   91.0   157.00  2809.0   
1         32.0  531.500000  921.626823   52.0  115.0  305.0   495.25  5223.0   
2          7.0  817.000000  649.783810  177.0  356.5  708.0  1022.00  2077.0   

        Impressions                ...      CTR         Position            \
              count          mean  ...      75%     max    count      mean   
cluster                            ...                                       
0             961.0   1196.543184  ...  0.34570  0.8548    961.0  3.806951   
1              32.0  14566.531250  ...  0.03665  0.3583     32.0  8.646250   
2               7.0  46208.428571  ...  0.02585  0.0310      7.0  7.252857   

                                                  

In [19]:
# Example of a scatter plot using two features and coloring by cluster.
# Cluster ids are labels, not quantities: left as integers, Plotly Express maps
# them onto a continuous color bar, so they are cast to strings to get one
# discrete color and one legend entry per cluster. Sorting by the integer id
# first keeps the legend in numeric order. assign() works on a copy, so
# queries_df keeps its integer 'cluster' column for the cells that follow.
fig_cluster = px.scatter(
    queries_df.sort_values('cluster').assign(cluster=lambda df: df['cluster'].astype(str)),
    x='Clicks',
    y='Impressions',
    color='cluster',
    title='Clusters of Search Queries by Clicks and Impressions',
)
fig_cluster.show()

In [20]:
# Preview the first five queries assigned to one cluster (cluster 0 here;
# change the compared value to inspect another cluster).
print(queries_df.loc[queries_df['cluster'].eq(0)].head(5))

                                 Top queries  Clicks  Impressions     CTR  \
1                        thecleverprogrammer    2809         3456  0.8128   
3  classification report in machine learning    2012         4959  0.4057   
4                      the clever programmer    1931         2528  0.7638   
5        standard scaler in machine learning    1559         7292  0.2138   
6                               aman kharwal    1490         5752  0.2590   

   Position  anomaly  cluster  
1      1.02       -1        0  
3      1.28        1        0  
4      1.09       -1        0  
5      1.53        1        0  
6      3.75        1        0  


In [21]:
from sklearn.metrics import silhouette_score

# Mean silhouette coefficient of the K-Means labelling, using Euclidean
# distances. It is evaluated on `features`, the frame the clustering cell
# fitted the model on, so that cell must have run before this one.
silhouette_avg = silhouette_score(features, queries_df['cluster'], metric='euclidean')
print(f"The average silhouette_score is : {silhouette_avg}")

The average silhouette_score is : 0.869127943548297
