# Downloading datasets


* First, I will install the Kaggle command-line tool.
* Then I will download the datasets into the notebook's working directory.

In [None]:
! pip install kaggle
! pip install pandas
! pip install matplotlib

In [None]:
# Download the two Kaggle datasets used in this notebook; the extraction cell
# that follows looks for the resulting .zip files in the working directory.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
! kaggle datasets download -d datafiniti/consumer-reviews-of-amazon-products
! kaggle datasets download -d karkavelrajaj/amazon-sales-dataset

In [None]:
import os
import zipfile as zp

# Collect the name of every .zip archive in the current working directory.
zip_file_to_extract = []
for entry in os.listdir():
    if entry.endswith('.zip'):
        zip_file_to_extract.append(entry)

# Make sure the extraction target exists (no error if it is already there).
os.makedirs('dataset', exist_ok=True)

# Unpack each archive into 'dataset', then delete the archive once it is closed.
for archive_name in zip_file_to_extract:
    with zp.ZipFile(archive_name, mode='r') as archive:
        archive.extractall(path='dataset')
    os.remove(archive_name)

# Performing tasks on the datasets


I will perform 5 tasks on the 2 datasets:
* handling missing values
* merging datasets
* renaming columns
* creating new columns
* type conversion

! pip install pandas

In [None]:
import pandas as pd

# Handling Missing Values


### for Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products

In [None]:
# Load the Datafiniti consumer-reviews CSV from the extracted dataset folder.
Amazon_consumer_review = pd.read_csv(
    filepath_or_buffer='dataset/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv'
)
# Display the number of missing values in each column.
Amazon_consumer_review.isna().sum()

Some values in the `reviews.username` column are missing, so I fill them with the most common username in that column.

In [None]:
# Fill missing usernames with the most frequent username in the column.
# The result is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on the selected column: that chained form can operate on
# a temporary object and leave the DataFrame itself unchanged.
most_common_username = Amazon_consumer_review['reviews.username'].value_counts().idxmax()
Amazon_consumer_review['reviews.username'] = (
    Amazon_consumer_review['reviews.username'].fillna(most_common_username)
)
# Re-check the missing-value counts per column.
Amazon_consumer_review.isnull().sum()

Now I do the same for the `reviews.title` column.

In [None]:
# Fill missing review titles with the most frequent title in the column.
# The result is assigned back to the DataFrame instead of calling
# fillna(inplace=True) on the selected column: that chained form can operate on
# a temporary object and leave the DataFrame itself unchanged.
most_common_title = Amazon_consumer_review['reviews.title'].value_counts().idxmax()
Amazon_consumer_review['reviews.title'] = (
    Amazon_consumer_review['reviews.title'].fillna(most_common_title)
)
# Re-check the missing-value counts per column.
Amazon_consumer_review.isnull().sum()

The columns `reviews.dateAdded` and `reviews.id` each have more than 5% missing values, so I am going to drop them.

In [None]:
# Remove the two review columns, modifying the frame in place.
columns_to_drop = ['reviews.dateAdded', 'reviews.id']
Amazon_consumer_review.drop(columns=columns_to_drop, inplace=True)
# Display the remaining missing-value counts per column.
Amazon_consumer_review.isna().sum()

### For amazon.csv

In [None]:
# Read the Amazon sales CSV from the extracted dataset folder.
amazon = pd.read_csv(filepath_or_buffer='dataset/amazon.csv')
# Print the frame summary, then display the missing-value count per column.
amazon.info()
amazon.isna().sum()

In the amazon dataset only 2 rows have missing values, so it's safe to drop them.

In [None]:
# Drop, in place, every row that contains at least one missing value.
amazon.dropna(axis='index', how='any', inplace=True)
# Display the missing-value count per column after the drop.
amazon.isna().sum()

# Merging datasets

In [None]:
# Print a summary of the reviews frame before merging.
Amazon_consumer_review.info()


In [None]:
# Print a summary of the amazon frame before merging.
amazon.info()

In [None]:
# Left-join the amazon sales frame onto the reviews frame: every review row is
# kept, and sales columns are attached where the keys match.
# NOTE(review): this joins the reviews' 'id' to the sales 'product_id' — confirm
# the two columns actually share the same identifier space.
merge_csv = Amazon_consumer_review.merge(
    amazon,
    how="left",
    left_on='id',
    right_on='product_id',
)
merge_csv

In [None]:
# Write the merged frame to disk; the row index is written as the first column.
merge_csv.to_csv('merged_csv.csv', index=True)

# Renaming column in dataset


### Renaming the columns in  Amazon_consumer_review

In [None]:
# Product-level columns: original name -> descriptive snake_case name.
product_column_names = {
    'id': 'product_id',
    'dateAdded': 'product_added_date',
    'dateUpdated': 'product_last_updated_date',
    'name': 'product_name',
    'asins': 'amazon_asin',
    'brand': 'brand_name',
    'categories': 'product_categories',
    'primaryCategories': 'primary_category',
    'imageURLs': 'product_image_urls',
    'keys': 'product_keys',
    'manufacturer': 'manufacturer_name',
    'manufacturerNumber': 'manufacturer_part_number',
}

# Review-level columns (the ones prefixed with "reviews.").
review_column_names = {
    'reviews.date': 'review_date',
    'reviews.dateAdded': 'review_added_date',
    'reviews.dateSeen': 'review_last_viewed_date',
    'reviews.doRecommend': 'is_recommended',
    'reviews.id': 'review_id',
    'reviews.numHelpful': 'helpful_votes_count',
    'reviews.rating': 'review_rating',
    'reviews.sourceURLs': 'review_source_urls',
}

# Full old-name -> new-name mapping, product columns first, then review columns.
column_rename_mapping = {**product_column_names, **review_column_names}

In [None]:
# Apply the mapping to the column labels in place; mapping keys that are not
# present in the frame are skipped rather than raising.
Amazon_consumer_review.rename(
    mapper=column_rename_mapping,
    axis='columns',
    errors='ignore',
    inplace=True,
)
# Print the summary to confirm the new column names.
Amazon_consumer_review.info()
# Save the renamed frame; the row index is written as the first column.
Amazon_consumer_review.to_csv('renamed amazon consumer review.csv', index=True)

### Renaming the columns in amazon dataset

In [None]:
# Print a summary of the amazon frame to see the current column names.
amazon.info()

In [None]:
# Product metadata ('product_id' and 'product_name' map to themselves).
product_fields = {
    'product_id': 'product_id',
    'product_name': 'product_name',
    'category': 'product_category',
    'img_link': 'product_image_url',
    'product_link': 'product_page_url',
}

# Pricing information.
pricing_fields = {
    'discounted_price': 'current_price',
    'actual_price': 'original_price',
    'discount_percentage': 'discount_pct',
}

# Ratings and product description.
rating_fields = {
    'rating': 'average_rating',
    'rating_count': 'total_ratings',
    'about_product': 'product_description',
}

# Reviewer and review metadata ('review_id' and 'review_title' map to themselves).
reviewer_fields = {
    'user_id': 'reviewer_id',
    'user_name': 'reviewer_name',
    'review_id': 'review_id',
    'review_title': 'review_title',
    'review_content': 'review_text',
}

# Full old-name -> new-name mapping for the amazon frame, in group order.
column_rename_mapping = {
    **product_fields,
    **pricing_fields,
    **rating_fields,
    **reviewer_fields,
}

In [None]:
# Apply the mapping to the column labels in place; mapping keys that are not
# present in the frame are skipped rather than raising.
amazon.rename(
    mapper=column_rename_mapping,
    axis='columns',
    errors='ignore',
    inplace=True,
)
# Print the summary to confirm the new column names.
amazon.info()
# Export left disabled: amazon.to_csv('rename amazon.csv')

# Creating New Columns

### first doing the Amazon_consumer_review 

Here I add a boolean column that is true when the manufacturer and the brand have the same value.

In [None]:
# Compare brand and manufacturer names case-insensitively, row by row, and
# store the boolean result as a new column.
brand_lower = Amazon_consumer_review['brand_name'].str.lower()
manufacturer_lower = Amazon_consumer_review['manufacturer_name'].str.lower()
Amazon_consumer_review['is manufacture same as brand'] = brand_lower.eq(manufacturer_lower)

Now I add an "urgent review" flag, which is raised when the rating of the product is 2 or lower.

In [None]:
# Mark a review as urgent when its rating is less than or equal to 2.
review_ratings = Amazon_consumer_review['review_rating']
Amazon_consumer_review['urgent review'] = review_ratings.le(2)

now saving the file as a separate csv

In [None]:
# Save the frame with the added columns; the row index is written as the first column.
Amazon_consumer_review.to_csv('new columns amazon consumer review.csv', index=True)
# Display the frame as the cell output.
Amazon_consumer_review

### Now doing it for amazon

Adding a column containing the length of the review text.

In [None]:
# Store the character count of each review text as a new column.
review_texts = amazon['review_text']
amazon['review length'] = review_texts.str.len()

In [None]:
# Print a summary of the amazon frame, now including 'review length'.
amazon.info()

# Type conversion on Columns

### Amazon consumer review

In [None]:
# Print a summary of the reviews frame to inspect the current column dtypes.
Amazon_consumer_review.info()

In [None]:
# Cast the product identifier column to str and write it back to the frame.
review_product_ids = Amazon_consumer_review['product_id'].astype(str)
Amazon_consumer_review['product_id'] = review_product_ids

### Amazon

In [None]:
# Print a summary of the amazon frame to inspect the current column dtypes.
amazon.info()

In [None]:
# Cast the product identifier column to str and write it back to the frame.
amazon_product_ids = amazon['product_id'].astype(str)
amazon['product_id'] = amazon_product_ids

In [None]:
# Print the summary again to check the column dtypes after the conversion.
amazon.info()

## 📊 Visualization: Distribution of Product Ratings

In [7]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Locate the consumer-reviews CSV inside the dataset folder. os.listdir returns
# names in arbitrary order, so the matches are sorted to make the chosen file
# the same on every run.
reviews_file = sorted(
    file for file in os.listdir('dataset')
    if 'consumer' in file.lower() and file.endswith('.csv')
)
if not reviews_file:
    # Fail with a clear message instead of a NameError on `df` further down.
    raise FileNotFoundError(
        "No consumer reviews CSV found in 'dataset'; run the download and extraction cells first."
    )
df = pd.read_csv(os.path.join('dataset', reviews_file[0]))

# The reviews data keeps its star rating in 'reviews.rating' (the column the
# renaming section maps to 'review_rating'); a plain 'rating' column only
# exists in amazon.csv.
rating_column = 'reviews.rating'
if rating_column not in df.columns:
    raise KeyError(
        f"Expected column {rating_column!r} in {reviews_file[0]!r}; found {list(df.columns)}"
    )

# Coerce ratings to numbers (unparseable values become NaN), then drop every
# row without a usable rating.
df[rating_column] = pd.to_numeric(df[rating_column], errors='coerce')
df = df.dropna(subset=[rating_column])

# Plot rating distribution: one bar per distinct rating value.
plt.figure(figsize=(8, 5))
sns.countplot(x=rating_column, data=df, palette='viridis')
plt.title('Distribution of Product Ratings')
plt.xlabel('Rating')
plt.ylabel('Number of Reviews')
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'seaborn'