In [10]:
import pandas as pd
import json

# Load the raw contacts file. The first two rows are skipped and no header
# row is read, so pandas labels the columns with integers (0, 1, 2, ...).
contact_info_df = pd.read_csv(
    "contacts.csv",
    header=None,
    skiprows=2,
)

# Debug: preview the first rows to understand the structure
print("Initial DataFrame:", contact_info_df.head(), sep="\n")



Initial DataFrame:
      0         1         2                            3
0  3765   Mariana     Ellis      mariana.ellis@rossi.org
1  4187     Sofie     Woods      sofie.woods@riviere.com
2  4941  Jeanette  Iannotti  jeanette.iannotti@yahoo.com
3  2199    Samuel   Sorgatz     samuel.sorgatz@gmail.com
4  5650   Socorro      Luna     socorro.luna@hotmail.com


In [11]:
# Collect each row's source fields into a list of strings.
# The derived "contact_info" column is excluded first so that re-running this
# cell (after the column already exists) still yields one list entry per
# source column instead of also including the previous result.
contact_info_df["contact_info"] = (
    contact_info_df
    .drop(columns=["contact_info"], errors="ignore")  # no-op on the first run
    .apply(lambda row: row.astype(str).to_list(), axis=1)
)


In [12]:
# Serialize one row's list of fields into a JSON string
def convert_to_json(lst):
    """Return a JSON string describing one contact, or None.

    Args:
        lst: Sequence of exactly four items, in the order
            contact id, first name, last name, email.

    Returns:
        A JSON object string with the keys "contact_id" (the first item
        converted with int()), "name" (first and last name joined by a single
        space) and "email". None is returned when ``lst`` does not contain
        exactly four items.
    """
    # Guard clause: anything that is not a four-field row is rejected.
    if len(lst) != 4:
        return None

    record = {}
    record["contact_id"] = int(lst[0])
    first, last = lst[1], lst[2]
    record["name"] = f"{first} {last}"
    record["email"] = lst[3]
    return json.dumps(record)

# Replace every row's list of fields with its JSON string
# (convert_to_json yields None for rows that do not have four fields).
contact_info_df["contact_info"] = (
    contact_info_df["contact_info"].map(convert_to_json)
)


In [13]:
# Function to parse JSON string
def parse_json_string(json_str):
    """Parse a JSON string into the corresponding Python object.

    Args:
        json_str: JSON text to decode. Missing values (None or a float NaN)
            are accepted and passed through as None.

    Returns:
        The decoded object, or None when the input is a missing value or is
        not valid JSON. Invalid JSON is reported on stdout.
    """
    # Missing values would make json.loads raise TypeError, which the
    # JSONDecodeError handler below does not catch. `x != x` is only true
    # for NaN, so this also covers missing cells stored as float NaN.
    if json_str is None or (isinstance(json_str, float) and json_str != json_str):
        return None
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e} for string: {json_str}")
        return None

# Decode every JSON string in the column back into a Python object
contact_info_df["contact_info"] = (
    contact_info_df["contact_info"].map(parse_json_string)
)

# Debug: preview the parsed JSON objects
print("\nParsed JSON objects:", contact_info_df["contact_info"].head(), sep="\n")



Parsed JSON objects:
0    {'contact_id': 3765, 'name': 'Mariana Ellis', ...
1    {'contact_id': 4187, 'name': 'Sofie Woods', 'e...
2    {'contact_id': 4941, 'name': 'Jeanette Iannott...
3    {'contact_id': 2199, 'name': 'Samuel Sorgatz',...
4    {'contact_id': 5650, 'name': 'Socorro Luna', '...
Name: contact_info, dtype: object


In [14]:
# Build a new DataFrame whose columns are the keys of the parsed JSON objects.
# Missing entries (None/NaN) are dropped first: the earlier conversion and
# parsing steps return None for rows they cannot handle, and pd.DataFrame
# cannot build columns from a list that mixes dicts with None.
parsed_contacts = contact_info_df["contact_info"].dropna()
skipped_rows = len(contact_info_df) - len(parsed_contacts)
if skipped_rows:
    # Only printed when something was dropped, so clean data produces the
    # same output as before.
    print(f"Skipped {skipped_rows} row(s) without parsed contact info")
contacts_df_clean = pd.DataFrame(parsed_contacts.tolist())

# Debug: Display the cleaned DataFrame
print("\nCleaned DataFrame with JSON parsed:")
print(contacts_df_clean.head())



Cleaned DataFrame with JSON parsed:
   contact_id               name                        email
0        3765      Mariana Ellis      mariana.ellis@rossi.org
1        4187        Sofie Woods      sofie.woods@riviere.com
2        4941  Jeanette Iannotti  jeanette.iannotti@yahoo.com
3        2199     Samuel Sorgatz     samuel.sorgatz@gmail.com
4        5650       Socorro Luna     socorro.luna@hotmail.com


In [15]:
# Break 'name' into 'first_name' and 'last_name' at the first space only.
# NOTE: 'name' was built upstream as "<first> <last>", so a first name that
# itself contains a space would have its remainder land in 'last_name'.
contacts_df_clean[["first_name", "last_name"]] = (
    contacts_df_clean["name"].str.split(pat=" ", n=1, expand=True)
)


In [16]:
# Keep only the output columns, in their final order.
# Selecting the columns explicitly already leaves out 'name', so no separate
# in-place drop is needed; this also keeps the cell re-runnable, because it
# no longer raises KeyError when 'name' has already been removed.
contacts_df_clean = contacts_df_clean[
    ["contact_id", "first_name", "last_name", "email"]
]


In [17]:
# Show the finished DataFrame under its heading
print("\nFinal DataFrame:", contacts_df_clean, sep="\n")



Final DataFrame:
     contact_id    first_name   last_name  \
0          3765       Mariana       Ellis   
1          4187         Sofie       Woods   
2          4941      Jeanette    Iannotti   
3          2199        Samuel     Sorgatz   
4          5650       Socorro        Luna   
..          ...           ...         ...   
994        3684       Whitney       Noack   
995        5784     Gelsomina  Migliaccio   
996        1498   Evangelista     Pereira   
997        6073        Gareth     Comolli   
998        4939  Michelangelo        Hess   

                                         email  
0                      mariana.ellis@rossi.org  
1                      sofie.woods@riviere.com  
2                  jeanette.iannotti@yahoo.com  
3                     samuel.sorgatz@gmail.com  
4                     socorro.luna@hotmail.com  
..                                         ...  
994             whitney.noack@laboratorios.org  
995              gelsomina.migliaccio@junk.com  


In [18]:
# Report the dtype of every column in the final DataFrame
print("\nData types of final DataFrame columns:", contacts_df_clean.dtypes, sep="\n")



Data types of final DataFrame columns:
contact_id     int64
first_name    object
last_name     object
email         object
dtype: object


In [19]:
# Write the cleaned contacts to disk as CSV, leaving out the row index
contacts_df_clean.to_csv(
    path_or_buf="cleanedcontacts.csv",
    index=False,
)
