In [73]:
# Import the libraries to use:
import os
import csv
import pandas as pd

<h1>File Merge:</h1>
In this notebook we will merge the CSV files of each brand into a single PARQUET file.

Function to read each CSV file:

In [74]:
def custom_csv_reader(file_path):
    with open(file_path, newline='', encoding="utf-8") as csvfile:
        
        # Define the appropriate delimiter and quote character for your CSV file
        csv_reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        
        # List to store processed rows
        processed_rows = []
        
        # Iterate over the rows in the CSV file
        for row in csv_reader:
            # Concatenate all columns except those enclosed in quotes
            processed_row = []
            quoted = False  # Flag to track if we're inside quotes
            for item in row:
                if quoted:
                    processed_row[-1] += ',' + item  # Append to the last element
                    if item.endswith('"'):
                        quoted = False  # End quote marker
                else:
                    if item.startswith('"'):
                        if item.endswith('"'):
                            processed_row.append(item[1:-1])  # If completely enclosed in quotes
                        else:
                            processed_row.append(item[1:])  # If starts with quotes
                            quoted = True
                    else:
                        processed_row.append(item)  # If not enclosed in quotes
            processed_rows.append(processed_row)
            
        return processed_rows

Folder scan and loop

In [75]:
# Path of the folder you want to traverse:
directory = 'Dataset\Raw data'

# Get the list of files in the folder:
files = os.listdir(directory)

# List to store records from each file:
all_data = []

# List to store deleted registers:
deleted_registers = []

# Counter:
counter = 0

# Loop:
for file in files:

    counter = counter + 1
    print(f"-----{counter}-----")
    print(f"Working on file: {file}")

    # CSV reading function:
    file_path = f"Dataset\Raw data\{file}"
    data = custom_csv_reader(file_path)

    # Fixing unintended line breaks:
    merged_data = []

    for sublist in data:
        if sublist and not sublist[0].isdigit():  # Check if the first element is not a number
            # Merge with the last element of the previous list:
            if merged_data:
                merged_data[-1][-1] += ' ' + sublist[0]
                if len(sublist) > 1:  # If there are more elements in the current sublist
                    merged_data[-1].extend(sublist[1:])  # Add the remaining elements to the merged list
            else:
                merged_data.append(sublist)
        else:
            merged_data.append(sublist)

    # View registers with errors:
    for i in merged_data:
        if len(i) != 7:
            print(f"Deleted {i[0]} from {file}")
            deleted_registers.append([i[0], file])

    # Filter out error-free sublists:
    merged_data = [sublist for sublist in merged_data if len(sublist) == 7]
    # Now merged_data only contains sublists with 7 elements

    # Concatenation of data from the files:
    all_data = all_data + merged_data

    print(f"Finished: {file}")

  directory = 'Dataset\Raw data'
  file_path = f"Dataset\Raw data\{file}"
  file_path = f"Dataset\Raw data\{file}"


-----1-----
Working on file: Scraped_Car_Review_dodge.csv
Deleted 376 from Scraped_Car_Review_dodge.csv
Deleted 916 from Scraped_Car_Review_dodge.csv
Deleted 3067 from Scraped_Car_Review_dodge.csv
Deleted 3951 from Scraped_Car_Review_dodge.csv
Deleted 5732 from Scraped_Car_Review_dodge.csv
Deleted 5974 from Scraped_Car_Review_dodge.csv
Deleted 6008 from Scraped_Car_Review_dodge.csv
Deleted 6652 from Scraped_Car_Review_dodge.csv
Deleted 8273 from Scraped_Car_Review_dodge.csv
Finished: Scraped_Car_Review_dodge.csv
-----2-----
Working on file: Scraped_Car_Review_ferrari.csv
Finished: Scraped_Car_Review_ferrari.csv
-----3-----
Working on file: Scraped_Car_Review_fiat.csv
Finished: Scraped_Car_Review_fiat.csv
-----4-----
Working on file: Scraped_Car_Review_ford.csv
Deleted 532 from Scraped_Car_Review_ford.csv
Deleted 709 from Scraped_Car_Review_ford.csv
Deleted 1583 from Scraped_Car_Review_ford.csv
Deleted 4990 from Scraped_Car_Review_ford.csv
Deleted 5596 from Scraped_Car_Review_ford.csv
D

Conversion to DataFrame:

In [76]:
# Convert list to DataFrame:
df = pd.DataFrame(all_data[1:], columns=data[0])

In [77]:
# Eliminate those records with headers:
df = df[df["Review_Date"] != "Review_Date"]

# Remove the column of previous indexes:
del df[""]

In [78]:
len(df)

226763

In [79]:
df.head()

Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating
0,on 10/13/05 15:30 PM (PDT),roadking,2002 Dodge Ram Cargo Van 1500 3dr Van (3.9L 6c...,Great delivery vehicle,It's been a great delivery vehicle for my caf...,4.625
1,on 07/17/05 21:59 PM (PDT),Mark,2002 Dodge Ram Cargo Van 3500 3dr Ext Van (5.2...,Disappointmnet,Bought this car as a commuter vehicle for a v...,2.125
2,on 07/16/02 00:00 AM (PDT),Tom Sheer,2002 Dodge Ram Cargo Van 3500 Maxi 3dr Ext Van...,Sweet van,"This van rocks its the best, lots of \rroom. ...",5.0
3,on 12/29/07 21:57 PM (PST),Keven Smith,2001 Dodge Ram Cargo Van 2500 Maxi 3dr Ext Van...,Keven Smith,Great work vehicle. Drives nice. has lots of ...,4.5
4,on 02/09/05 18:52 PM (PST),VanMan,2001 Dodge Ram Cargo Van 1500 3dr Van (3.9L 6c...,Not what Dodge used to be,Good solid frame and suspension. Well equipp...,2.875


Check the registers that were deleted:

In [80]:
for i in deleted_registers:
    
    # Obtener el nombre del archivo sin la extensión
    file_name = os.path.splitext(i[1])[0]
    
    # Obtener la palabra después del último guion bajo
    brand = file_name.split('_')[-1]

    i[1] = brand.capitalize()

In [81]:
# Create a DataFrame:
columns = ["Index", "Brand"]
deleted = pd.DataFrame(deleted_registers, columns=columns)
deleted.head()

Unnamed: 0,Index,Brand
0,376,Dodge
1,916,Dodge
2,3067,Dodge
3,3951,Dodge
4,5732,Dodge


These were the registers deleted from each brand, due to errors in the original file:

In [82]:
deleted["Brand"].value_counts()

Brand
Ford             17
Toyota           14
Chevrolet        12
Dodge             9
Hyundai           8
Jeep              7
Mercedes-benz     7
Volkswagen        7
Honda             7
Volvo             6
Nissan            6
Mazda             6
Pontiac           5
Lincoln           4
Bmw               4
Cadillac          4
Subaru            4
Acura             4
Lexus             3
Kia               3
Jaguar            3
Tesla             3
Buick             2
Gmc               2
Audi              2
Chrysler          2
Suzuki            2
Porsche           2
Mini              2
Mercury           2
Land-rover        2
Infiniti          2
Maserati          1
Lotus             1
Hummer            1
Genesis           1
Mitsubishi        1
Name: count, dtype: int64

Saving as PARQUET:

In [83]:
df.to_parquet(r'Dataset\Modified data\car_reviews.parquet')