# Reading and understanding the data from Amazon Book Reviews Dataset from Kaggle

## Reading the first csv file 

In [2]:
import pandas as pd #importing the library
book_data = pd.read_csv('books_data.csv')
#book_data

## Check the column names and droping the unwanted columns

#### The actual size of these two datasets are too large(more than 1 GB), so dropping unwanted columns can help to load teh data faster

In [4]:
book_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   publisher      136518 non-null  object 
 4   publishedDate  187099 non-null  object 
 5   categories     171205 non-null  object 
 6   ratingsCount   49752 non-null   float64
dtypes: float64(1), object(6)
memory usage: 11.3+ MB


In [3]:
book_data.drop(columns = ['image' , 'previewLink', 'infoLink'] , inplace = True) 

## Reading the second dataset and dropping the unwanted columns

In [5]:
import pandas as pd
book_reviews = pd.read_csv('Books_rating.csv')
#book_reviews

In [7]:
book_reviews.drop(columns = ['Id', 'User_id', 'profileName', 'review/time' ] , inplace = True) 

In [8]:
book_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 6 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Title               object 
 1   Price               float64
 2   review/helpfulness  object 
 3   review/score        float64
 4   review/summary      object 
 5   review/text         object 
dtypes: float64(2), object(4)
memory usage: 137.3+ MB


## The user entered data might have inconsistent data so changing them to same format

In [9]:
book_data.Title.str.title()

0                            Its Only Art If Its Well Hung!
1                                  Dr. Seuss: American Icon
2                     Wonderful Worship In Smaller Churches
3                             Whispers Of The Wicked Saints
4         Nation Dance: Religion, Identity And Cultural ...
                                ...                        
212399    The Orphan Of Ellis Island (Time Travel Advent...
212400                              Red Boots For Christmas
212401                                                Mamaw
212402                                    The Autograph Man
212403    Student'S Solutions Manual For Johnson/Mowry'S...
Name: Title, Length: 212404, dtype: object

In [10]:
# getting rid of trailing whitespace

book_data.Title.str.strip()

0                            Its Only Art If Its Well Hung!
1                                  Dr. Seuss: American Icon
2                     Wonderful Worship in Smaller Churches
3                             Whispers of the Wicked Saints
4         Nation Dance: Religion, Identity and Cultural ...
                                ...                        
212399    The Orphan Of Ellis Island (Time Travel Advent...
212400                              Red Boots for Christmas
212401                                                Mamaw
212402                                    The Autograph Man
212403    Student's Solutions Manual for Johnson/Mowry's...
Name: Title, Length: 212404, dtype: object

In [11]:
book_reviews.Title.str.title()

0          Its Only Art If Its Well Hung!
1                Dr. Seuss: American Icon
2                Dr. Seuss: American Icon
3                Dr. Seuss: American Icon
4                Dr. Seuss: American Icon
                        ...              
2999995               The Idea Of History
2999996               The Idea Of History
2999997               The Idea Of History
2999998               The Idea Of History
2999999               The Idea Of History
Name: Title, Length: 3000000, dtype: object

In [12]:
book_reviews.Title.str.strip()

0          Its Only Art If Its Well Hung!
1                Dr. Seuss: American Icon
2                Dr. Seuss: American Icon
3                Dr. Seuss: American Icon
4                Dr. Seuss: American Icon
                        ...              
2999995               The Idea of History
2999996               The Idea of History
2999997               The Idea of History
2999998               The Idea of History
2999999               The Idea of History
Name: Title, Length: 3000000, dtype: object

## Understanding the data to normalize them

In [13]:
book_data.describe()

Unnamed: 0,ratingsCount
count,49752.0
mean,21.252975
std,201.340431
min,1.0
25%,1.0
50%,2.0
75%,5.0
max,4895.0


## Before normalizing changing the column names so that all of them are in the same format

In [15]:
#reanaming the columns

book_data.rename(columns = {'Title' : 'title', 'publishedDate' : 'published_date', 'ratingsCount' : 'ratings_count'}, inplace = True)


## Checking if numeric columns have int or float fdatatype and string columns have object datatype

In [16]:
book_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   title           212403 non-null  object 
 1   description     143962 non-null  object 
 2   authors         180991 non-null  object 
 3   publisher       136518 non-null  object 
 4   published_date  187099 non-null  object 
 5   categories      171205 non-null  object 
 6   ratings_count   49752 non-null   float64
dtypes: float64(1), object(6)
memory usage: 11.3+ MB


## Following the same for the 2nd dataset

In [20]:
#reanaming the columns

book_reviews.rename(columns = {'Title' : 'title', 'Price' : 'price', 'review/helpfulness' : 'review_helpfulness', 'review/score' :'review_score', 'review/summary':'review_summary','review/text':'review_text' }, inplace = True)


In [21]:
book_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 6 columns):
 #   Column              Dtype  
---  ------              -----  
 0   title               object 
 1   price               float64
 2   review_helpfulness  object 
 3   review_score        float64
 4   review_summary      object 
 5   review_text         object 
dtypes: float64(2), object(4)
memory usage: 137.3+ MB


## Normalizing the data 

In [22]:
col = ['ratings_count']
book_data[col] = (book_data[col] - book_data[col].min()) / (book_data[col].max() - book_data[col].min())

In [24]:
book_data.describe()

Unnamed: 0,ratings_count
count,49752.0
mean,0.004138
std,0.04114
min,0.0
25%,0.0
50%,0.000204
75%,0.000817
max,1.0


In [25]:
book_reviews.describe()

Unnamed: 0,price,review_score
count,481171.0,3000000.0
mean,21.762656,4.215289
std,26.206541,1.203054
min,1.0,1.0
25%,10.78,4.0
50%,14.93,5.0
75%,23.95,5.0
max,995.0,5.0


In [28]:
col = ['price', 'review_score']
book_reviews[col] = (book_reviews[col] - book_reviews[col].min()) / (book_reviews[col].max() - book_reviews[col].min())

In [29]:
book_reviews.describe()

Unnamed: 0,price,review_score
count,481171.0,3000000.0
mean,0.020888,0.8038223
std,0.026365,0.3007634
min,0.0,0.0
25%,0.009839,0.75
50%,0.014014,1.0
75%,0.023089,1.0
max,1.0,1.0


### Now we can see that the data is normalized, we can merge teh two dataframes

## Merging the dataframes

In [30]:
# merging Books_df.csv and Books_rating.csv

merged_data = pd.merge(book_data, book_reviews, on='title', how='inner')

## Saving the chnages and the merged dataframe to a csv file

In [None]:
# Save the merged data to a new CSV file
merged_data.to_csv('books_and_reviews.csv', index=False)

### We can see that the new data is also normalized

In [2]:
import pandas as pd
newdata = pd.read_csv('books_and_reviews.csv')
newdata.describe()

Unnamed: 0,ratings_count,price,review_score
count,889355.0,230465.0,1583270.0
mean,0.060555,0.020418,0.8054438
std,0.161097,0.025536,0.2995465
min,0.0,0.0,0.0
25%,0.000409,0.009799,0.75
50%,0.002043,0.014014,1.0
75%,0.013895,0.022233,1.0
max,1.0,0.879276,1.0


In [3]:
newdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1583270 entries, 0 to 1583269
Data columns (total 12 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   title               1583062 non-null  object 
 1   description         1241093 non-null  object 
 2   authors             1365062 non-null  object 
 3   publisher           1173688 non-null  object 
 4   published_date      1378140 non-null  object 
 5   categories          1273565 non-null  object 
 6   ratings_count       889355 non-null   float64
 7   price               230465 non-null   float64
 8   review_helpfulness  1583270 non-null  object 
 9   review_score        1583270 non-null  float64
 10  review_summary      1583037 non-null  object 
 11  review_text         1583268 non-null  object 
dtypes: float64(3), object(9)
memory usage: 145.0+ MB
