In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Load the app-level Play Store dataset from the Kaggle input directory.
df=pd.read_csv("/kaggle/input/google-play-store/google_play_store_dataset.csv")

In [None]:
# Report the (rows, columns) dimensions of the freshly loaded frame.
print(df.shape)

In [None]:
# List the column labels, then preview the first five rows.
print(df.columns)
print(df.head())

In [None]:
# Count missing values in each column of the raw frame.
df.isnull().sum()

In [None]:
# Variance of every column that pandas parsed with a numeric dtype.
df.select_dtypes(include=['number']).var()

In [None]:
# Print column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Drop rows that repeat an earlier row in every column.
df = df.drop_duplicates()

In [None]:
# Frame dimensions after de-duplication.
df.shape

In [None]:
# Fill missing 'Type' values from 'Price': anything that is not a recognised
# "free" marker after stripping '$', ',' and whitespace counts as 'Paid'.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df['Type']`: in-place fills on a column
# selection are deprecated and do not update `df` once Copy-on-Write is active.
price_text = (
    df['Price'].astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
free_markers = ['0', '0.0', 'Free', 'NaN', 'nan', '']
inferred_type = pd.Series(
    np.where(price_text.isin(free_markers), 'Free', 'Paid'),
    index=df.index,
)
# Only rows whose Type is missing take the inferred value.
df['Type'] = df['Type'].fillna(inferred_type)

In [None]:
# Re-count missing values per column after the 'Type' fill.
df.isnull().sum()

In [None]:
# Discard rows that have no 'Content Rating'.
df = df.dropna(subset=['Content Rating'])

In [None]:
# Missing-value counts after removing rows without a 'Content Rating'.
df.isnull().sum()

In [None]:
# Remove the two version columns. Reassigning (rather than inplace=True) and
# passing errors='ignore' keeps the cell re-runnable: a second execution no
# longer raises KeyError because the columns are already gone.
df = df.drop(columns=['Current Ver', 'Android Ver'], errors='ignore')

In [None]:
# Column labels of the frame at this point.
df.columns

In [None]:
# Summary statistics for 'Rating', then a summary and per-value counts for 'Content Rating'.
print(df['Rating'].describe())
print(df['Content Rating'].describe())
print(df['Content Rating'].value_counts())

In [None]:
# Mean of the 'Rating' column.
df['Rating'].mean()

In [None]:
# Number of rows in each Category.
df["Category"].value_counts()

In [None]:


from sklearn.ensemble import RandomForestRegressor
import warnings

# NOTE(review): this silences every warning for the rest of the kernel
# session, not only the ones raised in this cell.
warnings.filterwarnings('ignore')

print(df.isnull().sum())

# ---- 1. Coerce text columns to numbers -------------------------------------
# Exclude rows whose Category is the literal string '1.9'.
# NOTE(review): presumably a mis-shifted record in the raw CSV -- confirm.
df = df[df['Category'] != '1.9'].copy()
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Reviews'] = pd.to_numeric(df['Reviews'].astype(str), errors='coerce')

# Installs: strip ',' and '+' before conversion.
df['Installs'] = df['Installs'].astype(str).str.replace(r'[,+]', '', regex=True)
df['Installs'] = pd.to_numeric(df['Installs'], errors='coerce')

# Price: strip the '$' sign before conversion.
df['Price'] = df['Price'].astype(str).str.replace('$', '', regex=False)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')


def _size_to_mb(value):
    """Convert one raw Size entry to a float; 'k' values are divided by 1024.

    '<n>M' becomes n, '<n>k' becomes n / 1024, and any other text becomes NaN.
    Values that are already numeric are returned unchanged, so re-running
    this cell no longer turns an already-converted column into all-NaN.
    """
    if isinstance(value, (int, float, np.number)):
        return float(value)
    text = str(value)
    if 'M' in text:
        return float(text.replace('M', ''))
    if 'k' in text:
        return float(text.replace('k', '')) / 1024
    return np.nan


df['Size'] = df['Size'].apply(_size_to_mb)

# ---- 2. Build the model matrix ---------------------------------------------
# Fill gaps in the categorical columns with each column's most frequent value
# so that one-hot encoding sees no missing categories.
for col in ['Type', 'Content Rating', 'Genres', 'Category']:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].mode()[0])

df_processed = pd.get_dummies(df, columns=['Category', 'Type', 'Content Rating', 'Genres'])

# Everything except the target and the two free-text/date columns is a feature.
feature_columns = df_processed.columns.drop(['Rating', 'App', 'Last Updated'], errors='ignore')

# Rows with a known Rating train the model; rows without one get predicted.
df_train = df_processed[df_processed['Rating'].notnull()]
X_train = df_train[feature_columns]
y_train = df_train['Rating']

df_to_predict = df_processed[df_processed['Rating'].isnull()]
X_predict = df_to_predict[feature_columns]

# Remaining numeric gaps (e.g. unparsed Size) are fed to the model as 0.
X_train = X_train.fillna(0)
X_predict = X_predict.fillna(0)

# ---- 3. Fit and impute -----------------------------------------------------
model = RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1, min_samples_leaf=5)
model.fit(X_train, y_train)

if not X_predict.empty:
    predicted_ratings = model.predict(X_predict)
    # df and df_processed share row order, so the boolean mask lines up with
    # the prediction array; predictions are rounded to one decimal place.
    df.loc[df['Rating'].isnull(), 'Rating'] = np.round(predicted_ratings, 1)
print("'Rating' imputation complete.")

In [None]:
# Missing-value counts after the 'Rating' imputation cell.
df.isnull().sum()

In [None]:

# Two-stage fill for missing Size values: first the median Size of the app's
# own Category, then the overall median for anything still missing.
category_median_size = df.groupby('Category')['Size'].transform('median')
df['Size'] = df['Size'].fillna(category_median_size)
overall_median_size = df['Size'].median()
df['Size'] = df['Size'].fillna(overall_median_size)

In [None]:
# Missing-value counts after the 'Size' fill.
df.isnull().sum()

In [None]:
# Summary statistics for the 'App' column.
df["App"].describe()

In [None]:

# ---- Date features ----------------------------------------------------------
# Parse 'Last Updated' once and derive year, month and age in days from it.
# Days_Since_Update is measured against the current UTC time, so its values
# change from run to run.
last_updated = pd.to_datetime(df['Last Updated'])
df['Last_Updated'] = last_updated
df['Update_Year'] = last_updated.dt.year
df['Update_Month'] = last_updated.dt.month
now_utc = pd.Timestamp.now(tz='UTC')
df['Days_Since_Update'] = (now_utc - last_updated.dt.tz_localize('UTC')).dt.days

# ---- Engagement -------------------------------------------------------------
# Reviews per install; rows whose Installs is not above zero get a rate of 0.
has_installs = df['Installs'] > 0
reviews_per_install = df['Reviews'] / df['Installs']
df['Engagement_Rate'] = np.where(has_installs, reviews_per_install, 0)

# ---- Install tiers ----------------------------------------------------------
# Left-closed bins: each tier includes its lower edge and excludes its upper.
bins = [0, 10000, 1000000, 100000000, 1000000000, 10000000001]
labels = ['Low (0-10k)', 'Medium (10k-1M)', 'High (1M-100M)', 'Very High (100M-1B)', 'Massive (1B+)']
df['Install_Tier'] = pd.cut(df['Installs'], bins=bins, labels=labels, right=False)

# ---- Genres -----------------------------------------------------------------
# Split 'Genres' on the first ';' into a primary and an optional secondary part.
genre_parts = df['Genres'].str.split(';', n=1, expand=True)
df[['Primary_Genre', 'Secondary_Genre']] = genre_parts
df['Secondary_Genre'] = df['Secondary_Genre'].fillna('None')

df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Re-parse Installs defensively so the cell also works if the column still
# holds raw text such as '1,000+'.
installs_numeric = pd.to_numeric(df['Installs'].astype(str).str.replace(r'[,+]', '', regex=True), errors='coerce')
installs_numeric.name = 'Installs_Numeric'

temp_df = df[['Category']].join(installs_numeric)

# Total installs per category, largest first.
category_installs = temp_df.groupby('Category')['Installs_Numeric'].sum().sort_values(ascending=False)

print("Top 15 Categories by Total Installs:")
print(category_installs.head(15))

plt.figure(figsize=(10, 8))

# The axis label promises billions, so scale the raw counts by 1e9 before
# plotting. Ascending order puts the largest bar at the top of the chart.
top15_in_billions = category_installs.head(15).sort_values(ascending=True) / 1e9
top15_in_billions.plot(kind='barh')

plt.title('Top 15 App Categories by Total Installs')
plt.xlabel('Total Installs (in Billions)')
plt.ylabel('Category')
plt.tight_layout()
plt.savefig('category_installs_plot.png')

print("\nGenerated 'category_installs_plot.png'")

In [None]:

bins = [0, 10000, 1000000, 100000000, 1000000000, 10000000001]
labels = ['Low (0-10k)', 'Medium (10k-1M)', 'High (1M-100M)', 'Very High (100M-1B)', 'Massive (1B+)']

# Compute the tiers on a local Series instead of writing to (and afterwards
# dropping) df['Install_Tier']: the plotting cell no longer removes the
# Install_Tier feature that the feature-engineering cell added to df.
install_tier = pd.cut(df['Installs'], bins=bins, labels=labels, right=False).rename('Install_Tier')

# observed=False is passed explicitly so the result does not depend on the
# pandas version's default; tiers with no apps yield NaN and are dropped.
mean_ratings = (
    df['Rating']
    .groupby(install_tier, observed=False)
    .mean()
    .dropna()
    .reset_index()
)

plt.figure(figsize=(10, 6))
ax = sns.barplot(
    x='Install_Tier',
    y='Rating',
    data=mean_ratings,
    palette='Blues_d'
)

plt.title('Average App Rating vs. Install Tiers', fontsize=16)
plt.xlabel('Install Tiers', fontsize=12)
plt.ylabel('Average Rating (1-5)', fontsize=12)

# Zoom the y-axis so small differences between tiers are visible.
plt.ylim(3.5, 4.5)

# Write each bar's height just above it.
for p in ax.patches:
    ax.annotate(
        f'{p.get_height():.2f}',
        (p.get_x() + p.get_width() / 2., p.get_height()),
        ha='center', va='center',
        xytext=(0, 9),
        textcoords='offset points'
    )

plt.tight_layout()
plt.savefig('rating_vs_installs_barchart_CLEAN.png')

print("Generated 'rating_vs_installs_barchart_CLEAN.png'")

In [None]:
# Mean and median Rating per app Type, as a flat table with 'Type' as a column.
rating_comparison = (
    df.groupby('Type')['Rating']
    .agg(['mean', 'median'])
    .reset_index()
)

# Long format: one row per (Type, Metric) pair so seaborn can group bars by metric.
rating_melted = pd.melt(
    rating_comparison,
    id_vars='Type',
    value_vars=['mean', 'median'],
    var_name='Metric',
    value_name='Rating',
)

plt.figure(figsize=(8, 6))
sns.barplot(data=rating_melted, x='Type', y='Rating', hue='Metric', palette='muted')

plt.title('Mean vs. Median Ratings: Free vs. Paid', fontsize=14)
plt.ylim(3.8, 4.6)
plt.ylabel('Rating')
plt.xlabel('App Type')

# Annotate every bar with its value.
bar_axes = plt.gca()
for container in bar_axes.containers:
    bar_axes.bar_label(container, fmt='%.2f')

plt.show()

In [None]:
# Per-Type mean and median Rating, plus the gap between them (mean - median).
rating_comparison = (
    df.groupby('Type')['Rating']
    .agg(['mean', 'median'])
    .assign(difference=lambda stats: stats['mean'] - stats['median'])
)
print(rating_comparison)

In [None]:

print("--- Analyzing Ratings for Free vs. Paid Apps ---")

# Restrict to the two expected Type values.
plot_df = df[df['Type'].isin(['Free', 'Paid'])]

grouped_stats = plot_df.groupby('Type')['Rating'].agg(['mean', 'median', 'count'])
print(grouped_stats)

plt.figure(figsize=(7, 5))
# seaborn's barplot draws the mean of 'Rating' for each 'Type'.
ax = sns.barplot(
    x='Type',
    y='Rating',
    data=plot_df,
    palette='Pastel1'
)

plt.title('Average Rating: Free vs. Paid Apps', fontsize=16)
plt.xlabel('App Type', fontsize=12)
plt.ylabel('Average Rating (1-5)', fontsize=12)

plt.ylim(3.5, 4.5)

# Label every bar. Depending on the seaborn version, the bars may be grouped
# into one container or one container per bar (when `palette` is set), so
# labelling only ax.containers[0] can leave a bar without its value.
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', label_type='edge', padding=5)

plt.tight_layout()
plt.savefig('free_vs_paid_barchart.png')

print("\nGenerated 'free_vs_paid_barchart.png'")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Columns that take part in the correlation matrix; all are expected to be
# numeric after the cleaning cells above.
numeric_cols = ['Rating', 'Reviews', 'Size', 'Installs', 'Price', 'Days_Since_Update']

# Pairwise correlation between the selected metrics.
corr = df.loc[:, numeric_cols].corr()

# Annotated heatmap of the matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of App Metrics')
plt.show()

# Text view: every metric's correlation with Installs, strongest first.
installs_corr = corr['Installs'].sort_values(ascending=False)
print("Correlation with Installs:\n", installs_corr)

In [None]:
# Group by Content Rating to see average Installs and Rating
content_rating_stats = df.groupby('Content Rating').agg({
    'Installs': 'mean',
    'Rating': 'mean',
    'App': 'count'
}).sort_values(by='Installs', ascending=False)

print(content_rating_stats)

# Visualization
plt.figure(figsize=(10, 6))
sns.barplot(x=content_rating_stats.index, y='Installs', data=content_rating_stats, palette='magma')
plt.title('Average Installs by Content Rating')
plt.ylabel('Average Installs')
plt.xlabel('Target Audience')
plt.xticks(rotation=45)
plt.show()

In [None]:
paid_apps = df[df['Type'] == 'Paid'].copy()

# Left-closed price bins. The top bin is open-ended (np.inf) so that it
# matches its "50$+" label; with a hard upper edge of 200, apps priced at 200
# or more fell outside every bin and vanished from the plot.
bins = [0, 2, 5, 10, 20, 50, np.inf]
# Dashes restored from mis-decoded text (was "0‚Äì2$" etc.).
labels = ["0–2$", "2–5$", "5–10$", "10–20$", "20–50$", "50$+"]

paid_apps["Price_Range"] = pd.cut(paid_apps["Price"], bins=bins, labels=labels, right=False)

plt.figure(figsize=(10, 6))
sns.boxplot(x="Price_Range", y="Rating", data=paid_apps, palette="Blues")
plt.title("Rating Distribution by Price Range (Paid Apps)")
plt.xlabel("Price Range")
plt.ylabel("Rating (1–5)")
plt.ylim(1, 5)
plt.show()


In [None]:
# Load the user-review dataset from the Kaggle input directory.
review=pd.read_csv("/kaggle/input/google-play-store/googleplaystore_user_reviews.csv")

In [None]:
# (rows, columns) of the raw review frame.
review.shape

In [None]:
# Every column except 'App' counts as review content.
is_content_column = review.columns != 'App'
cols_to_check = review.columns[is_content_column].tolist()

# Drop rows where all of those content columns are missing at once.
review_clean = review.dropna(how='all', subset=cols_to_check)

In [None]:
# Dimensions after removing rows with no review content at all.
review_clean.shape

In [None]:
# Missing values still present in each review column.
review_clean.isnull().sum()

In [None]:
# Keep only rows that have review text.
review_clean = review_clean.dropna(subset=['Translated_Review'])

In [None]:
# Dimensions after dropping rows without review text.
review_clean.shape

In [None]:
# Keep the first occurrence of each (App, Translated_Review) pair.
review_clean = review_clean.drop_duplicates(subset=['App', 'Translated_Review'])

In [None]:
# Dimensions after removing duplicate reviews.
review_clean.shape

In [None]:
import re

# 2. Text Normalization Function
def normalize_text(text):
    """Return `text` lower-cased and reduced to letters a-z and single spaces.

    The input is passed through ``str()`` first, so non-string values are
    accepted. Every character that is neither a-z nor whitespace is deleted
    (not replaced by a space), runs of whitespace collapse to one space, and
    leading/trailing whitespace is trimmed.
    """
    lowered = str(text).lower()
    # Deleting (rather than spacing out) punctuation means "don't" -> "dont".
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()

# Store a normalised copy of each translated review next to the original text.
review_clean['Cleaned_Review'] = review_clean['Translated_Review'].map(normalize_text)

print("Preprocessing Complete.")
print(f"Final Row Count: {len(review_clean)}")
# Side-by-side preview of raw vs. cleaned text for the first rows.
preview_columns = ['Translated_Review', 'Cleaned_Review']
print(review_clean[preview_columns].head())

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Set the visual style for the charts below.
sns.set_style("whitegrid")

# CHART 1: THE SENTIMENT DISTRIBUTION (Slide 2)
plt.figure(figsize=(7, 7))
counts = review_clean['Sentiment'].value_counts()
# value_counts() sorts descending, so the first slice is the largest one.
# The explode list is sized from `counts` instead of being a fixed 3-tuple,
# which raised an error whenever the number of sentiment classes was not 3.
explode = [0.05 if position == 0 else 0 for position in range(len(counts))]
plt.pie(counts, labels=counts.index, autopct='%1.1f%%',
        startangle=140, colors=['#66b3ff', '#ff9999', '#99ff99'],
        explode=explode)
plt.title('Overall Sentiment Distribution')
plt.show()

# CHART 2: SUBJECTIVITY VS POLARITY (Slide 4)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=review_clean, x='Sentiment_Polarity', y='Sentiment_Subjectivity',
                hue='Sentiment', alpha=0.5, palette='viridis')
# Vertical guide at polarity 0.
plt.axvline(0, color='grey', linestyle='--', linewidth=1)
plt.title('Sentiment Analysis: Subjectivity (Fact) vs. Polarity (Emotion)')
plt.xlabel('Polarity (Negative <---> Positive)')
plt.ylabel('Subjectivity (Objective <---> Subjective)')
plt.legend(title='Sentiment Type')
plt.show()

# CHART 3: THE NEGATIVE WORD CLOUD (Slide 5)
# Concatenate every cleaned negative review into one text blob. The loop
# variable is named `review_text` so it cannot be confused with the `review`
# DataFrame loaded earlier.
negative_reviews = review_clean.loc[review_clean['Sentiment'] == 'Negative', 'Cleaned_Review']
neg_text = " ".join(review_text for review_text in negative_reviews)

# Generate cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', colormap='Reds').generate(neg_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Most Frequent Words in Negative Reviews')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# The three apps with the most rows in the cleaned review table.
top_3_apps = list(review_clean['App'].value_counts().index[:3])
print(f"--- The Top 3 Most Reviewed Apps are: ---\n{top_3_apps}\n")

# 2 x 3 grid: sentiment pies on the top row, negative word clouds below.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
plt.subplots_adjust(hspace=0.4, wspace=0.3)

for i, app_name in enumerate(top_3_apps):
    pie_ax = axes[0, i]
    cloud_ax = axes[1, i]

    # Reviews belonging to this app only.
    app_data = review_clean.loc[review_clean['App'] == app_name]

    # Headline numbers for the app.
    print(f"=== {app_name} Stats ===")
    print(f"Total Reviews: {len(app_data)}")
    print(f"Mean Polarity: {app_data['Sentiment_Polarity'].mean():.2f}")
    print(f"Mean Subjectivity: {app_data['Sentiment_Subjectivity'].mean():.2f}")
    print("-" * 30)

    # Top row: sentiment mix, with the most frequent class(es) pulled out.
    counts = app_data['Sentiment'].value_counts()
    largest_count = counts.max()
    slice_offsets = [0.05 if value == largest_count else 0 for value in counts]
    pie_ax.pie(counts, labels=counts.index, autopct='%1.1f%%',
               startangle=140, colors=['#66b3ff', '#ff9999', '#99ff99'],
               explode=slice_offsets)
    pie_ax.set_title(f"{app_name}\nSentiment Mix")

    # Bottom row: word cloud of the app's negative reviews, or a placeholder
    # message when it has none.
    neg_reviews = app_data[app_data['Sentiment'] == 'Negative']
    if neg_reviews.empty:
        cloud_ax.text(0.5, 0.5, "No Negative Reviews", ha='center')
    else:
        text = " ".join(neg_reviews['Cleaned_Review'])
        wc = WordCloud(width=400, height=300, background_color='white', colormap='Reds').generate(text)
        cloud_ax.imshow(wc, interpolation='bilinear')
        cloud_ax.set_title(f"Negative Keywords for\n{app_name}")

    cloud_ax.axis('off')

plt.suptitle('Deep Dive: Top 3 Most Reviewed Apps', fontsize=20)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Character count of each cleaned review.
review_clean['Review_Length'] = review_clean['Cleaned_Review'].map(len)

# Box plot of review length per sentiment class; outliers are hidden.
sentiment_order = ['Positive', 'Neutral', 'Negative']
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=review_clean,
    x='Sentiment',
    y='Review_Length',
    order=sentiment_order,
    palette='Set2',
    showfliers=False,
)

plt.title('User Effort: Review Length by Sentiment')
plt.xlabel('Sentiment Type')
plt.ylabel('Number of Characters in Review')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score

# ==========================================
# 1. SETUP DATA
# ==========================================
# An app counts as a "Success" when its Installs exceed the median Installs
# of its own Category.
df['Category_Median'] = df.groupby('Category')['Installs'].transform('median')
df['Success'] = (df['Installs'] > df['Category_Median']).astype(int)

features = ['Category', 'Size', 'Type', 'Price', 'Content Rating']

X = df[features].copy()
y = df['Success']

# ==========================================
# 2. ENCODE, SPLIT, THEN SCALE
# ==========================================
# One LabelEncoder per categorical column, kept in `encoders` so new inputs
# can be encoded the same way later.
# NOTE(review): label encoding gives nominal categories an arbitrary numeric
# order, which KNN distances then treat as meaningful -- consider one-hot
# encoding.
encoders = {}
for col in ['Category', 'Type', 'Content Rating']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

# Split BEFORE scaling so the held-out rows do not influence the scaler's
# mean/variance (the original fitted the scaler on all rows: data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Kept for backward compatibility with any later cell that reads X_scaled.
X_scaled = scaler.transform(X)

# ==========================================
# 3. FIND THE BEST 'K' VALUE
# ==========================================
# K is chosen by 5-fold cross-validation on the training split only. Picking
# K by test-set accuracy and then reporting that same accuracy (as the
# original did) makes the final score optimistically biased.
scores = []
k_range = range(1, 21)  # Try K from 1 to 20

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    fold_accuracy = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    scores.append(fold_accuracy.mean())

# Accuracy-vs-K curve.
plt.figure(figsize=(10, 6))
plt.plot(k_range, scores, marker='o', linestyle='--', color='blue')
plt.title('Finding the Optimal K (Number of Neighbors)')
plt.xlabel('Value of K')
plt.ylabel('Mean Cross-Validated Accuracy')
plt.grid(True)
plt.savefig('knn_accuracy_plot.png')
plt.show()

# ==========================================
# 4. TRAIN FINAL MODEL WITH BEST K
# ==========================================
best_k = k_range[int(np.argmax(scores))]  # first K with the highest CV accuracy
print(f"Optimal K found: {best_k}")

knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(X_train, y_train)

# The test split is touched exactly once, for the final report.
y_pred = knn_model.predict(X_test)
print(f"KNN Model Accuracy: {accuracy_score(y_test, y_pred):.1%}")

In [None]:
def predict_with_knn(category, size, app_type, price, age_rating):
    """Print a hit/flop prediction for a hypothetical app and its nearest neighbours.

    Relies on notebook globals created by the KNN training cell: `encoders`,
    `scaler`, `knn_model`, `y_train`, and the app frame `df`.

    Parameters
    ----------
    category, app_type, age_rating : str
        Values for 'Category', 'Type' and 'Content Rating'; each must be a
        label the matching encoder was fitted on.
    size : float
        Value for the 'Size' feature.
    price : float
        Value for the 'Price' feature.

    Returns
    -------
    None
        Results are printed. An unknown categorical label prints an error
        message and returns early.
    """
    # 1. Build a one-row frame with the same columns (and order) as training.
    input_df = pd.DataFrame({
        'Category': [category],
        'Size': [size],
        'Type': [app_type],
        'Price': [price],
        'Content Rating': [age_rating]
    })

    # 2. Encode categoricals. LabelEncoder.transform raises ValueError for a
    #    label it never saw; only that case is treated as bad user input, so
    #    genuine programming errors are no longer swallowed by a bare except.
    try:
        for col in ['Category', 'Type', 'Content Rating']:
            input_df[col] = encoders[col].transform(input_df[col])
    except ValueError:
        print("Error: Invalid Category/Rating.")
        return

    # 3. Scale with the scaler fitted during training.
    input_scaled = scaler.transform(input_df)

    # 4. Predict, and report the probability of the class actually predicted.
    #    (The original always showed P(hit), even next to a FLOP verdict.)
    prediction = knn_model.predict(input_scaled)[0]
    class_probabilities = knn_model.predict_proba(input_scaled)[0]
    predicted_position = list(knn_model.classes_).index(prediction)
    confidence = class_probabilities[predicted_position]

    print(f"📱 KNN ANALYSIS FOR: {category} ({age_rating}) | {size}MB")

    if prediction == 1:
        print(f"   ✅ PREDICTION: HIT (Confidence: {confidence:.1%})")
        print("   Logic: Your app is mathematically similar to other successful apps.")
    else:
        print(f"   ❌ PREDICTION: FLOP (Confidence: {confidence:.1%})")
        print("   Logic: Your app is mathematically similar to failing apps.")

    print("-" * 40)

    # 5. Show the three closest training apps.
    #    kneighbors() returns row positions within the training matrix, which
    #    was shuffled by train_test_split. `y_train` shares that row order, so
    #    it supplies both the label and the original df index; using the
    #    positions with df.iloc (as before) pointed at unrelated rows.
    distances, indices = knn_model.kneighbors(input_scaled, n_neighbors=3)

    print("🔍 NEAREST NEIGHBORS (Existing apps closest to your idea):")
    for i in indices[0]:
        similar_app_success = y_train.iloc[i]
        similar_app_name = df.loc[y_train.index[i], 'App']
        print(f"   • Neighbor #{i}: {similar_app_name} - {'Hit' if similar_app_success==1 else 'Flop'}")

# TEST CASES: run the predictor once for a paid game and once for a free tool.
predict_with_knn('GAME', 95.0, 'Paid', 2.99, 'Everyone')
predict_with_knn('TOOLS', 10.0, 'Free', 0, 'Everyone')