In [1]:
import pandas as pd

In [2]:
# Load the apps dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the notebook
# only runs where this exact file exists — consider a path relative to a data directory.
app_data = pd.read_csv(r"C:\Users\DELL\Downloads\apps work.csv")

In [3]:
# Show the freshly loaded frame as the cell's rich output
app_data

In [4]:
# Check data types and structure
print(app_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
app_data.info()

In [5]:
# Report, for every column, how many of its entries are missing
print(
    app_data
    .isnull()
    .sum()
)

In [6]:
# Handle missing values: discard every row that lacks a value in any of the
# four columns below (one pass instead of one dropna call per column).
app_data = app_data.dropna(subset=['Rating', 'Size', 'Current Ver', 'Android Ver'])

In [7]:
# Re-check the per-column missing-value counts after the rows were dropped
print(app_data.isnull().sum())

In [8]:
# Boolean mask: True for each row that exactly repeats an earlier row
duplicates = app_data.duplicated()
print(duplicates)

In [9]:
# Remove exact duplicate rows. The de-duplicated frame has to be assigned back to
# app_data: every later cell reads app_data, so keeping the result only in
# unique_rows would leave the duplicates in all downstream counts and averages.
app_data = app_data.drop_duplicates()
unique_rows = app_data  # kept so existing references to unique_rows still work
print(unique_rows)

In [10]:
# Clean 'Installs': remove commas and '+' and convert to integers.
# astype(str) comes first so the cell can be re-run once the column is already
# numeric (the .str accessor raises on a non-string column); this also matches
# how the 'Price' and 'Size' cells prepare their columns.
app_data['Installs'] = (
    app_data['Installs']
    .astype(str)
    .str.replace('[+,]', '', regex=True)
    .astype(int)
)

In [11]:
# Inspect the converted 'Installs' column
print(app_data['Installs'])

In [12]:
# Check for NaN values in the entire DataFrame:
# first the cell-by-cell boolean mask, then the grand total of missing entries
# (sum over rows gives per-column counts, the second sum adds those up).
print(app_data.isna())
print(app_data.isna().sum().sum())

In [13]:
# Inspect the 'Price' column before it is cleaned
print(app_data['Price'])

In [14]:
# Convert 'Price' to string and strip the '$' currency symbol.
# '$' is the end-of-string anchor in a regular expression, so with regex=True the
# pattern only matches an empty position and the dollar sign is never removed
# (every paid price would then be coerced to NaN by a numeric conversion).
# It must be replaced as a literal string.
app_data['Price'] = app_data['Price'].astype(str).str.replace('$', '', regex=False)

In [15]:
# Convert the 'Price' strings to numbers; errors='coerce' turns any value
# that cannot be parsed into NaN instead of raising.
app_data['Price'] = pd.to_numeric(app_data['Price'], errors='coerce')

In [16]:
# Inspect the 'Price' column after the numeric conversion
print(app_data['Price'])

In [17]:
# Replace non-numeric sizes with an explicit missing value.
# Passing value=None to replace() is ambiguous: depending on the pandas version it
# either forward-fills from the previous row or inserts None. NaN states the intent.
app_data['Size'] = app_data['Size'].replace('Varies with device', float('nan'))

In [18]:
# Clean 'Size' column and convert kb to mb.
# The unit suffix is matched in either case (the previous code only stripped an
# uppercase 'K'), and kilobyte values are divided by 1024 so the whole column is
# expressed in megabytes. Anything that is not "<number>[M|k]" becomes NaN.
size_text = app_data['Size'].astype(str).str.strip()
size_value = pd.to_numeric(size_text.str.replace('[MmKk]$', '', regex=True), errors='coerce')
size_in_kilobytes = size_text.str.lower().str.endswith('k')
app_data['Size'] = size_value.where(~size_in_kilobytes, size_value / 1024)

In [19]:
# Convert 'Size' to numeric.
# The column stays float: unparseable values are coerced to NaN, and casting NaN
# to int raises. An integer cast would also truncate fractional sizes.
app_data['Size'] = pd.to_numeric(app_data['Size'], errors='coerce')

In [20]:
# Inspect the cleaned 'Size' column
print(app_data['Size'])

In [21]:
# App distribution across categories: count the 'App' entries in each
# 'Category' and return the result as a flat two-column frame.
category_distribution = (
    app_data
    .groupby('Category')['App']
    .count()
    .reset_index()
)

In [22]:
# Export to CSV for Power BI visualization (index=False leaves the row index out of the file)
category_distribution.to_csv('category_distribution1.csv', index=False)

In [23]:
# Display the per-category app counts
category_distribution

In [24]:
# Bar chart of the number of apps per category.
# The Axes returned by plot() gets its own name: binding it to
# category_distribution would overwrite the DataFrame of counts with a matplotlib
# object and break any re-run of the cells that export or display that frame.
category_distribution_ax = app_data.groupby('Category')['App'].count().plot(
    kind='bar',
    figsize=(10, 6),
    title='Distribution of Apps across Categories',
)

In [25]:
# Confirmation message for the category-distribution CSV export performed earlier
print("File saved successfully as 'category_distribution1.csv'")

In [26]:
# Mean 'Rating' within each 'Category', flattened into a two-column frame
Average_ratings = (
    app_data
    .groupby('Category')['Rating']
    .mean()
    .reset_index()
)

In [27]:
# Export to CSV for Power BI visualization.
# The file name previously ended in '.csv1', which is not a CSV extension.
Average_ratings.to_csv('Average_ratings.csv', index=False)

In [28]:
# Display the average rating per category
Average_ratings

In [29]:
# Bar chart of the average rating per category.
# The Axes returned by plot() gets its own name so the Average_ratings DataFrame
# is not overwritten by a matplotlib object.
average_ratings_ax = app_data.groupby('Category')['Rating'].mean().plot(
    kind='bar',
    figsize=(10, 6),
    title='Average Rating by Category',
)

In [30]:
import seaborn as sns
# Analyzing patterns between metrics (Ratings vs Size):
# one point per row of app_data, 'Size' on the x-axis and 'Rating' on the y-axis.
sns.scatterplot(data = app_data, x='Size', y='Rating')

In [31]:
# Analyzing patterns between price and ratings.
# NOTE(review): 'Price' was converted to numeric earlier, so this presumably draws
# one box per distinct price value — confirm that is the intended view.
sns.boxplot(data = app_data, x='Price', y='Rating')

In [32]:
# Analyze the distribution of app prices (e.g., free vs paid):
# count how many apps fall under each value of the 'Type' column and chart it.
price_distribution = app_data['Type'].value_counts()
# Title typo fixed ("Distributin" -> "Distribution").
price_distribution.plot(kind='bar', figsize=(8, 6), title='Distribution of App Prices')

In [33]:
# Count free and paid apps from the 'Type' column
type_distribution = app_data['Type'].value_counts()

# Plot the distribution.
# Pie wedges are coloured through the plural `colors` keyword; a singular `color`
# keyword is forwarded to matplotlib's pie(), which does not accept it.
type_distribution.plot(
    kind='pie',
    autopct='%1.1f%%',
    colors=['skyblue', 'orange'],
    figsize=(8, 6),
    title='Free vs Paid Apps',
)

In [34]:
# Interactive Visualization with Plotly:
# scatter of 'Rating' against 'Size', with one colour per 'Category'.
import plotly.express as px
fig = px.scatter(app_data, x='Size', y='Rating', color='Category', title='App Ratings vs Size')
# Render the interactive figure in the notebook output
fig.show()

In [35]:
# Size trend: mean 'Size' within each 'Category', as a flat two-column frame
size_trend = (
    app_data
    .groupby('Category')['Size']
    .mean()
    .reset_index()
)

In [36]:
# Export to CSV for Power BI visualization.
# The file name previously ended in '.csv1'; it is now the 'size_trend.csv' that the
# notebook's confirmation message reports.
size_trend.to_csv('size_trend.csv', index=False)

In [37]:
# Confirmation message for the size-trend CSV export
print("File saved successfully as 'size_trend.csv'")

In [38]:
# Display the average size per category
size_trend

In [39]:
# Pricing trend: mean 'Price' within each 'Category', as a flat two-column frame
price_trend = (
    app_data
    .groupby('Category')['Price']
    .mean()
    .reset_index()
)

In [40]:
# Export to CSV for Power BI visualization.
# The file name previously ended in '.csv1', which is not a CSV extension.
price_trend.to_csv('price_trend.csv', index=False)

In [41]:
import pandas as pd

# Load the user reviews dataset.
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists.
user_reviews = pd.read_csv(r'C:\Users\DELL\Downloads\user_reviews.csv')

# Inspect the dataset
print(user_reviews.head())
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
user_reviews.info()

In [42]:
# True if at least one cell anywhere in user_reviews is missing
user_reviews.isnull().values.any()

In [43]:
# Inspect the dataset for any missing value (per-column counts).
# NOTE(review): isnull() and isna() are documented pandas aliases, so both lines
# should print the same table — one of them is presumably redundant.
print(user_reviews.isnull().sum())
print(user_reviews.isna().sum())

In [44]:
# Discard reviews that miss any of the text/sentiment fields, then expose
# 'Translated_Review' under the shorter column name 'Review'.
user_reviews = (
    user_reviews
    .dropna(subset=['Translated_Review', 'Sentiment', 'Sentiment_Polarity', 'Sentiment_Subjectivity'])
    .rename(columns={'Translated_Review': 'Review'})
)

In [45]:
# Save the cleaned user_reviews dataset (row index omitted from the file)
user_reviews.to_csv('Cleaned_user_reviews.csv', index=False)

In [46]:
# Import necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk

# Download necessary NLTK resources.
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of
# 'punkt', so both are requested to keep word_tokenize working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only on first use."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


# Define a function for text preprocessing
def preprocess_text(text):
    """Normalise one review for sentiment analysis.

    The text is lower-cased, stripped of ASCII punctuation, tokenised with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        text: The review. Non-string values are converted with ``str()`` first.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Ensure the review is a string before applying string operations
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase and remove punctuation
    text = text.lower().translate(str.maketrans('', '', string.punctuation))

    # The stop-word set is fetched from a cache: rebuilding it for every review
    # repeats the same corpus read once per row.
    stop_words = _english_stop_words()

    # Tokenize, drop stop words, and rejoin
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Apply preprocessing to reviews: run preprocess_text on every 'Review' value
# and store the result in a new 'Processed_Reviews' column.
user_reviews['Processed_Reviews'] = user_reviews['Review'].apply(preprocess_text)

In [47]:
# Display the preprocessed review text
user_reviews['Processed_Reviews']

In [48]:
# Define a function to compute sentiment
def compute_sentiment(text):
    """Return the ``(polarity, subjectivity)`` pair TextBlob assigns to ``text``.

    Args:
        text: The (already preprocessed) review text.

    Returns:
        A 2-tuple ``(polarity, subjectivity)`` read from ``TextBlob(text).sentiment``.
    """
    # The attribute is lower-case `polarity`; `Analysis.Polarity` raised
    # AttributeError. The blob is built once and both scores are read from it.
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity


# Apply sentiment computation.
# The scores are collected into an index-aligned frame instead of zip(*...),
# which cannot be unpacked into two columns when there are no reviews.
sentiment_scores = pd.DataFrame(
    user_reviews['Processed_Reviews'].map(compute_sentiment).tolist(),
    columns=['Computed_Polarity', 'Computed_Subjectivity'],
    index=user_reviews.index,
)
user_reviews['Computed_Polarity'] = sentiment_scores['Computed_Polarity']
user_reviews['Computed_Subjectivity'] = sentiment_scores['Computed_Subjectivity']

In [49]:
# Inspect the frame now that the computed sentiment columns were added
print(user_reviews)

In [50]:
# Define mapping for computed polarity to sentiment

def map_polarity_to_sentiment(polarity):
    """Translate a numeric polarity score into a sentiment label.

    Args:
        polarity: The polarity score to classify.

    Returns:
        'Positive' for a score above zero, 'Negative' for a score below zero,
        and 'Neutral' for anything else (including exactly zero).
    """
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# Apply mapping: label every computed polarity as Positive / Negative / Neutral
user_reviews['Computed_Sentiment'] = user_reviews['Computed_Polarity'].apply(map_polarity_to_sentiment)

In [51]:
# Compare original and computed sentiment:
# count how often each (original label, computed label) pair occurs.
comparison = user_reviews[['Sentiment', 'Computed_Sentiment']].value_counts()
print(comparison)

In [52]:
# Scatterplot comparing the original polarity (x) with the computed polarity (y);
# points are coloured by the original 'Sentiment' label.
sns.scatterplot(data=user_reviews, x ='Sentiment_Polarity', y ='Computed_Polarity', hue ='Sentiment')
plt.title("Comparison of Original and Computed Polarity")
plt.xlabel("Original Polarity")
plt.ylabel("Computed Polarity")
plt.show()

In [53]:
# Save the dataset (original plus computed sentiment columns) as a CSV file
output_file = 'sentiment_analysis_comparison_results.csv'
user_reviews.to_csv(output_file, index=False)

# Report where the results were written
print(f"Sentiment analysis results saved to {output_file}")

In [132]:
# Count and plot sentiment distribution (based on the original 'Sentiment' labels)
sentiment_counts = user_reviews['Sentiment'].value_counts()

plt.figure(figsize=(8, 6))
# hue is set to the same labels as x so that the palette is applied per bar
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette='viridis', hue=sentiment_counts.index, legend=True)
plt.title('Distribution of Sentiments', fontsize=16)
plt.xlabel('Sentiment', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.show()

In [136]:
# Data Exploration: Polarity distribution
# Histogram (30 bins) of the original 'Sentiment_Polarity' with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(user_reviews['Sentiment_Polarity'], kde=True, bins=30, color='blue')
plt.title('Distribution of Sentiment Polarity', fontsize=16)
plt.xlabel('Sentiment Polarity', fontsize=14)
plt.ylabel('Frequency', fontsize=14)
plt.show()

In [138]:
# Save the enhanced dataset.
# NOTE(review): user_reviews does not appear to change after the earlier export to
# 'sentiment_analysis_comparison_results.csv', so this file presumably has the same content.
user_reviews.to_csv('enhanced_dataset.csv', index=False)

In [140]:
# Calculate average polarity and subjectivity by app.
# Named aggregation: one row per 'App' holding the mean of each original sentiment
# score; reset_index() turns the 'App' group key back into a regular column.
sentiment_summary = user_reviews.groupby('App').agg(
    Avg_Polarity=('Sentiment_Polarity', 'mean'),
    Avg_Subjectivity=('Sentiment_Subjectivity', 'mean')
).reset_index()

# Save for visualization in Power BI
sentiment_summary.to_csv('sentiment_summary.csv', index=False)

In [142]:
# The ten apps with the highest and the ten with the lowest average polarity
top_positive_apps = sentiment_summary.nlargest(10, 'Avg_Polarity')
top_negative_apps = sentiment_summary.nsmallest(10, 'Avg_Polarity')

print("Top Positive Apps:")
print(top_positive_apps)

print("\nTop Negative Apps:")
print(top_negative_apps)

In [144]:
# Save for visualization in Power BI.
# NOTE(review): the same sentiment_summary frame was already written to
# 'sentiment_summary.csv' when it was built — confirm this second copy is needed.
sentiment_summary.to_csv('sentiment_summary1.csv', index=False)