<!-- This notebook cleans out duplicates, fixes numbers and consolidates a group of spreadsheets -->
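This notebook consolidates the scraped CSV spreadsheets: it combines related file fragments, drops duplicate rows along the way, and writes the combined results to new files.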

### Combine CSV Files

The review CSVs were created in two batches because of issues encountered during scraping. The code in this section combines the related file fragments.

In [9]:
# Libraries Needed
import numpy as np
import pandas as pd

In [10]:
# Sanity check: list the input CSVs expected in ./scraping_file_inputs.
# Uses an IPython shell escape with Unix `ls` syntax (written on a Mac).
!ls -la ./scraping_file_inputs/*.csv

-rw-r--r--@ 1 mitchmac  staff    80418 Jul 30 01:14 ./scraping_file_inputs/cr_computer_NoRvws.csv
-rw-r--r--@ 1 mitchmac  staff    12942 Jul 30 01:47 ./scraping_file_inputs/cr_computer_NoRvws_ex.csv
-rw-r--r--@ 1 mitchmac  staff   245710 Jul 30 01:14 ./scraping_file_inputs/cr_computer_rvws.csv
-rw-r--r--@ 1 mitchmac  staff    19975 Jul 30 01:47 ./scraping_file_inputs/cr_computer_rvws_ex.csv
-rw-r--r--@ 1 mitchmac  staff    47391 Jul 27 17:46 ./scraping_file_inputs/cr_reviewPageURLs_AllComputers.csv
-rw-r--r--@ 1 mitchmac  staff  1775828 Jul 28 02:07 ./scraping_file_inputs/cr_spider_specs.csv


In [11]:
# Source CSV fragments to combine.
# Order matters: later cells slice this list, using fileList[:2] for the
# "NoRvws" files and fileList[2:] for the "rvws" files.
fileList = [
    "./scraping_file_inputs/cr_computer_NoRvws.csv",
    "./scraping_file_inputs/cr_computer_NoRvws_ex.csv",
    "./scraping_file_inputs/cr_computer_rvws.csv",
    "./scraping_file_inputs/cr_computer_rvws_ex.csv",
]

In [12]:
def build_df_from_csvList(fileLst, drop_dupes = True):
    """Read several CSV files and stack them into one DataFrame.

    Parameters
    ----------
    fileLst : list of str
        Paths of the CSV files to combine, in order. Must contain at least
        one path; an empty list raises ``IndexError``.
    drop_dupes : bool, default True
        When truthy, exact-duplicate rows are removed from every file
        *after the first* before it is stacked.
        NOTE(review): the first file is never de-duplicated, and duplicates
        that span two files are kept. This matches the original behavior and
        is preserved so existing row counts do not change -- confirm whether
        a global de-duplication is actually wanted.

    Returns
    -------
    pandas.DataFrame
        The rows of all files in list order. ``reset_index()`` is applied
        without ``drop=True``, so each row's label from its source frame is
        kept in a leading ``index`` column and a fresh 0..n-1 index is set.

    Notes
    -----
    The strings "", " ", "  " and "?" are parsed as missing values.
    Progress messages are printed while the files are read.
    """
    naList = ["", " ", "  ", "?"]

    first = pd.read_csv(fileLst[0], na_values=naList)
    print("initial df has ", len(first), " rows.")
    print("drop_dupes = " + str(drop_dupes))
    print("Combining These files:")
    print("\t", fileLst[0], sep="")

    # Collect the frames and concatenate once: DataFrame.append was removed
    # in pandas 2.0, and appending inside the loop re-copied the accumulated
    # frame on every iteration.
    frames = [first]
    for iFile in fileLst[1:]:
        print("\t", iFile, sep="")
        frame = pd.read_csv(iFile, na_values=naList)
        if drop_dupes:
            frame = frame.drop_duplicates()
        frames.append(frame)

    df = pd.concat(frames)
    print("df created with ", len(df), " rows.", sep="")
    return df.reset_index()

In [15]:
# Combine the two "rvws" CSVs (fileList[2:]). With the default drop_dupes=True,
# duplicate rows are dropped from the second file before it is stacked onto the first.
df_Reviews = build_df_from_csvList(fileList[2:])  # last two files in fileList

initial df has  173  rows.
drop_dupes = True
Combining These files:
	./scraping_file_inputs/cr_computer_rvws.csv
	./scraping_file_inputs/cr_computer_rvws_ex.csv
df created with 190 rows.


In [16]:
# Combine the two "NoRvws" CSVs (fileList[:2]) the same way.
df_Noreviews = build_df_from_csvList(fileList[:2])  # first two files in fileList (files, not records)

initial df has  224  rows.
drop_dupes = True
Combining These files:
	./scraping_file_inputs/cr_computer_NoRvws.csv
	./scraping_file_inputs/cr_computer_NoRvws_ex.csv
df created with 254 rows.


In [20]:
# Show the column names of both combined frames, separated by a rule,
# so the two schemas can be compared by eye.
print(df_Noreviews.columns.tolist())
print("-" * 72)
print(df_Reviews.columns.tolist())

['index', 'brand', 'model', 'prod_class', 'prod_short_descr', 'has_reviews', 'url', 'reported_exceptions']
------------------------------------------------------------------------
['index', 'brand', 'model', 'prod_class', 'prod_short_descr', 'num_usr_reviews', 'rec_rvw_rating', 'has_reviews', 'survey_cons', 'survey_pros', 'ratings_distribution', 'user_rating', 'user_rating_txt', 'user_rvw_hdln', 'submit_dateTime', 'submit_date', 'submit_time', 'rvw_username', 'rvw_userLocation', 'rvw_userCity', 'rvw_userState', 'rvw_userCountry', 'verified_buyer', 'verified_reviewer', 'user_review_content', 'btm_line_txt', 'url', 'reported_exceptions']


### Write the Result to New Files

In [17]:
# Write the combined reviews frame to the working directory.
# NOTE(review): to_csv is called with its defaults, so the row index should be
# written as an extra unnamed first column alongside the existing 'index'
# column -- confirm downstream readers expect that layout.
df_Reviews.to_csv('cr_reviewsPage_recs_w_Rvws.csv')

In [18]:
# Write the combined no-reviews frame to the working directory.
# NOTE(review): to_csv is called with its defaults, so the row index should be
# written as an extra unnamed first column alongside the existing 'index'
# column -- confirm downstream readers expect that layout.
df_Noreviews.to_csv('cr_reviewsPage_recs_w_NoRvws.csv')

In [23]:
# Summary statistics for the combined reviews frame.
df_Reviews.describe()

Unnamed: 0,index,num_usr_reviews,rec_rvw_rating,user_rating,user_rating_txt,verified_buyer
count,190.0,85.0,85.0,190.0,0.0,0.0
mean,79.021053,2.258824,3.163529,3.321053,,
std,52.757001,2.071076,1.312563,1.579324,,
min,0.0,1.0,1.0,1.0,,
25%,30.25,1.0,2.0,2.0,,
50%,77.5,1.0,3.0,4.0,,
75%,124.75,3.0,4.0,5.0,,
max,172.0,13.0,5.0,5.0,,


In [24]:
# Only the 'index' column created by reset_index() is summarized here,
# so the statistics carry no real information about the data.
df_Noreviews.describe()  # nothing useful here yet

Unnamed: 0,index
count,254.0
mean,100.043307
std,68.518441
min,0.0
25%,33.25
50%,96.5
75%,159.75
max,223.0


In [25]:
# sample:
df_Reviews['rec_rvw_rating'].value_counts()

3.0    15
5.0    13
4.0    12
1.0    12
2.0    11
2.5     3
4.3     3
3.7     3
4.8     2
2.2     2
3.5     2
2.4     1
4.5     1
1.7     1
4.4     1
3.4     1
3.9     1
4.1     1
Name: rec_rvw_rating, dtype: int64

In [None]:
## note:  reviews file modified by R scripts after creation of this code
##        * live file now has a "ReviewsGood" column for use in extracting word_cloud content

### A Few Old Notes for Reference
The code below is commented out; it was used in other parts of this project and is kept here for reference.

In [None]:
# len(df)

In [None]:
# tried list comp and it did not seem to work
# this was used on scrapy code in case useful here;

# tstLst = []
# for index, row in df.iterrows():
#     tstLst.append(row['url'] + "specs")
# print(len(tstLst))
# tstLst[0]