In [22]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import re

# The below scripts will be used for data cleaning purposes

In [23]:
#Start with reading the csv file that has the data
pd.read_csv(r'C:\Users\saifali\OneDrive - Wiley\Learning Resource\PythonProjects\feedback.csv',index_col=0)

Unnamed: 0_level_0,DATE,VENDOR,STAGE,FEEDBACK_AREA,SOURCE,FEEDBACK_TYPE,CRITICALITY,ACTION_TAKEN_BY_VENDOR,instance,ID
GID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SCI,2023-12-05,Alpha Press,Binding,Illustrations,Internal,Negative,Medium,Addressed,1,SCI2312051
POE,2024-11-04,Beta books,Typesetting,Body Text,External,Negative,High,Pending,1,POE2411041
POE,2023-01-10,Beta Books,Typesetting,Front Cover,Internal,Negative,Low,Revised,1,POE2301101
FIC,2024-04-03,Thomson Press,Typesetting,Front Cover,Internal,Negative,Medium,Rejected,1,FIC2404031
COM,2023-12-20,alpha press,Proofreading,Back Cover,Internal,Negative,Medium,Addressed,1,COM2312201
...,...,...,...,...,...,...,...,...,...,...
BIO,2024-07-16,AlBe Publishing,Proofreading,Index,Internal,Negative,Medium,Pending,1,BIO2407161
COM,2023-12-16,Thomson Press,Printing,Body Text,Internal,Positive,Low,Addressed,1,COM2312161
FIC,2024-10-17,Epsilon Ltd.,Typesetting,Body Text,Internal,Negative,Medium,Revised,1,FIC2410171
ROM,2023-05-09,AlBe Publishing,Binding,Illustrations,External,Negative,High,Addressed,1,ROM2305091


In [24]:
#Reordering the column name to ensure the table format remains same always and only rleveant fields are taken in
colummn_order=['ID','GID','DATE','VENDOR','STAGE','FEEDBACK_AREA','SOURCE','FEEDBACK_TYPE','CRITICALITY','ACTION_TAKEN_BY_VENDOR']
feedback=df[colummn_order]

In [25]:
feedback

Unnamed: 0,ID,GID,DATE,VENDOR,STAGE,FEEDBACK_AREA,SOURCE,FEEDBACK_TYPE,CRITICALITY,ACTION_TAKEN_BY_VENDOR
0,HIS2306211,HIS,2023-06-21,Gamma Publishing,Typesetting,Back Cover,External,Negative,Medium,Accepted
2,FIC2407181,FIC,2024-07-18,Thomson Press,Printing,Reference Section,External,Negative,High,Addressed
3,THR2408081,THR,2024-08-08,Delta Inc.,Binding,Body Text,External,Negative,Medium,Pending
4,HIS2404211,HIS,2024-04-21,Beta books,Manuscript,Index,External,Negative,Medium,Addressed
5,FIC2409181,FIC,2024-09-18,Beta Books,Typesetting,Back Cover,External,Negative,High,Revised
...,...,...,...,...,...,...,...,...,...,...
1503,POE2408194,POE,2024-08-19,alpha press,Binding,Body Text,External,Negative,High,Pending
1504,SCI2401253,SCI,2024-01-25,Gamma Publishing,Proofreading,Reference Section,Internal,Negative,Medium,Accepted
1507,ROM2403131,ROM,2024-03-13,Thomson Press,Printing,Illustrations,Internal,Negative,High,Revised
1510,BIO2401061,BIO,2024-01-06,Delta Inc.,Printing,Front Cover,Internal,Negative,Medium,Revised


In [26]:
#Checking if there are inconsistency with Vendor name by returning all uniques values this field holds
feedback.VENDOR.unique()

#This can be done by using some aggregation as well
feedback_by_vendor=feedback.groupby('VENDOR')['ID'].count()
print(feedback_by_vendor)

VENDOR
AlBe Publishing     97
Alpha Press         79
Beta Books          71
Beta books          89
Delta Inc.          97
Epsilon Ltd.        74
Gamma Publishing    82
Thomson Press       90
alpha press         86
Name: ID, dtype: int64


We will start by filtering our dataframe to show only relevant data:

*   **VENDOR:**
    *   Thomson Press needs to be excluded
*   **FEEDBACK_TYPE:**
    *   Only Negative feedback needs to be kept


In [27]:
#Ensuring only the data for relevant vendors are passed on for further steps by creating a list, removed 'Thomson Press' from the list
key_vendors=['Beta Books', 'Delta Inc.', 'Gamma Publishing', 'Alpha Press',
       'Epsilon Ltd.', 'AlBe Publishing', 'alpha press',
       'Beta books']

In [28]:
#creating a new dataframe that shows data for key vendors only
key_vendors_feedback = feedback[feedback['VENDOR'].isin(key_vendors)]

In [29]:
#creating a new dataframe that shows data for key vendors only
key_vendors_feedback = feedback[feedback['FEEDBACK_TYPE'].isin(['Negative'])]
key_vendors_feedback

Unnamed: 0,ID,GID,DATE,VENDOR,STAGE,FEEDBACK_AREA,SOURCE,FEEDBACK_TYPE,CRITICALITY,ACTION_TAKEN_BY_VENDOR
0,HIS2306211,HIS,2023-06-21,Gamma Publishing,Typesetting,Back Cover,External,Negative,Medium,Accepted
2,FIC2407181,FIC,2024-07-18,Thomson Press,Printing,Reference Section,External,Negative,High,Addressed
3,THR2408081,THR,2024-08-08,Delta Inc.,Binding,Body Text,External,Negative,Medium,Pending
4,HIS2404211,HIS,2024-04-21,Beta books,Manuscript,Index,External,Negative,Medium,Addressed
5,FIC2409181,FIC,2024-09-18,Beta Books,Typesetting,Back Cover,External,Negative,High,Revised
...,...,...,...,...,...,...,...,...,...,...
1503,POE2408194,POE,2024-08-19,alpha press,Binding,Body Text,External,Negative,High,Pending
1504,SCI2401253,SCI,2024-01-25,Gamma Publishing,Proofreading,Reference Section,Internal,Negative,Medium,Accepted
1507,ROM2403131,ROM,2024-03-13,Thomson Press,Printing,Illustrations,Internal,Negative,High,Revised
1510,BIO2401061,BIO,2024-01-06,Delta Inc.,Printing,Front Cover,Internal,Negative,Medium,Revised


In [30]:
# View the list of values in each field and check if there are any discrepancies
columns_to_check = ['GID', 'VENDOR', 'STAGE', 'FEEDBACK_AREA', 'SOURCE',
       'FEEDBACK_TYPE', 'CRITICALITY', 'ACTION_TAKEN_BY_VENDOR']  # Only the columns you want to see
for column in columns_to_check:
    print(f"\nUnique values in {column}:")
    print(key_vendors_feedback[column].unique())


Unique values in GID:
['HIS' 'FIC' 'THR' 'COM' 'BIO' 'POE' 'SCI' 'ROM']

Unique values in VENDOR:
['Gamma Publishing' 'Thomson Press' 'Delta Inc.' 'Beta books' 'Beta Books'
 'alpha press' 'Alpha Press' 'Epsilon Ltd.' 'AlBe Publishing']

Unique values in STAGE:
['Typesetting' 'Printing' 'Binding' 'Manuscript' 'Proofreading'
 'proof reading']

Unique values in FEEDBACK_AREA:
['Back Cover' 'Reference Section' 'Body Text' 'Index' 'Front Cover'
 'front cover' 'Illustrations' 'Body']

Unique values in SOURCE:
['External' 'Internal']

Unique values in FEEDBACK_TYPE:
['Negative']

Unique values in CRITICALITY:
['Medium' 'High' 'Low']

Unique values in ACTION_TAKEN_BY_VENDOR:
['Accepted' 'Addressed' 'Pending' 'Revised' 'Rejected' nan]


We can see that there are a few inconsistencies:

*   **VENDOR:**
    *   Variations in the name of Alpha and Beta.
    *   Further, they (Alpha and Beta) need to be grouped as AlBe Publishing (recent merger).
*   **STAGE:**
    *   Variations in how "Proofreading" is written.
*   **FEEDBACK_AREA:**
    *   Variations in how "Front Cover" is written.

In [31]:
# All variations of Alpha and Beta Press be grouped in AlBe Publishing
vendors_to_replace = ['Beta Books','Alpha Press','alpha press', 'Beta books']

for vendor in vendors_to_replace:
   key_vendors_feedback = key_vendors_feedback.replace(vendor, 'AlBe Publishing')

key_vendors_feedback.groupby('VENDOR')['ID'].count()

VENDOR
AlBe Publishing     414
Delta Inc.           95
Epsilon Ltd.         73
Gamma Publishing     81
Thomson Press        87
Name: ID, dtype: int64

In [32]:
# Replace proof reading with Proofreading
stage_to_replace = ['proof reading']

for stage in stage_to_replace:
   key_vendors_feedback = key_vendors_feedback.replace(stage, 'Proofreading')

key_vendors_feedback.groupby('STAGE')['ID'].count()

STAGE
Binding         149
Manuscript      144
Printing        159
Proofreading    153
Typesetting     145
Name: ID, dtype: int64

In [33]:
# Replace front cover with Front Cover
FEEDBACK_AREA_to_replace = ['front cover']

for FEEDBACK_AREA in FEEDBACK_AREA_to_replace:
   key_vendors_feedback = key_vendors_feedback.replace(FEEDBACK_AREA, 'Front Cover')

key_vendors_feedback.groupby('FEEDBACK_AREA')['ID'].count()

FEEDBACK_AREA
Back Cover           147
Body                   6
Body Text            118
Front Cover          127
Illustrations        122
Index                112
Reference Section    118
Name: ID, dtype: int64

In [34]:
key_vendors_feedback.to_excel("feedback_data.xlsx", index=False)