In [24]:
import pandas as pd

# Read the merged resume file, keeping just the three columns used below.
# The file is decoded as latin1; low_memory=False parses it in one pass.
df = pd.read_csv(
    'all_resumes_merged.csv',
    usecols=['ID', 'Resume', 'Category'],
    encoding='latin1',
    low_memory=False,
)

In [26]:
# Column labels, followed by the dtype pandas assigned to each column
print(df.columns)
print(df.dtypes)

# Python-level value types found in every column; more than one class
# in a column means it holds mixed types (e.g. str alongside float NaN)
for col in df.columns:
    unique_types = df[col].map(type).unique()
    print(f"{col}: {unique_types}")

# First ten rows as a quick look at the contents
print(df.head(10))

Index(['ID', 'Resume', 'Category'], dtype='object')
ID          object
Resume      object
Category    object
dtype: object
ID: [<class 'str'> <class 'float'>]
Resume: [<class 'str'>]
Category: [<class 'str'> <class 'float'>]
   ID                                             Resume  \
0   1  Database Administrator- Family Private Care LL...   
1   2  Database Administrator Database Administrator ...   
2   3  Oracle Database Administrator Oracle Database ...   
3   4  Amazon Redshift Administrator and ETL Develope...   
4   5  Scrum Master Scrum Master Scrum Master Richmon...   
5   6  Oracle Database Administrator Oracle Database ...   
6   7  Oracle Database Administrator Oracle Database ...   
7   8  Lead Database Administrator/Developer Lead Dat...   
8   9  Database Administrator / Database Developer Da...   
9  10  Oracle Database Administrator Oracle Database ...   

                 Category  
0  Database Administrator  
1  Database Administrator  
2  Database Administrator  
3 

In [28]:
# Normalise the mixed str/float columns to plain strings.
#
# Missing values are filled with 'Unknown' *before* the string cast. The
# previous form (astype(str) followed by replace('nan', ...)) depended on NaN
# being stringified to the text 'nan', which is version-dependent (newer pandas
# string dtypes keep missing values missing) and would also overwrite any
# value whose real text is 'nan'.

# ID: string, with missing IDs marked 'Unknown'
df['ID'] = df['ID'].fillna('Unknown').astype(str)

# Resume: string, with missing text marked 'Unknown'
df['Resume'] = df['Resume'].fillna('Unknown').astype(str)

# Category: string labels stored as a categorical, missing labels marked 'Unknown'
df['Category'] = df['Category'].fillna('Unknown').astype(str).astype('category')

In [30]:
# Write the normalised frame to disk re-encoded as UTF-8, omitting the row index
df.to_csv('cleaned_resumes_utf8.csv', encoding='utf-8', index=False)

In [32]:
# Read the UTF-8 file back in and confirm it parses:
# show the inferred dtypes and the first ten rows
df_cleaned = pd.read_csv(filepath_or_buffer='cleaned_resumes_utf8.csv')
print(df_cleaned.dtypes)
print(df_cleaned.head(n=10))

ID          object
Resume      object
Category    object
dtype: object
   ID                                             Resume  \
0   1  Database Administrator- Family Private Care LL...   
1   2  Database Administrator Database Administrator ...   
2   3  Oracle Database Administrator Oracle Database ...   
3   4  Amazon Redshift Administrator and ETL Develope...   
4   5  Scrum Master Scrum Master Scrum Master Richmon...   
5   6  Oracle Database Administrator Oracle Database ...   
6   7  Oracle Database Administrator Oracle Database ...   
7   8  Lead Database Administrator/Developer Lead Dat...   
8   9  Database Administrator / Database Developer Da...   
9  10  Oracle Database Administrator Oracle Database ...   

                 Category  
0  Database Administrator  
1  Database Administrator  
2  Database Administrator  
3  Database Administrator  
4  Database Administrator  
5  Database Administrator  
6  Database Administrator  
7  Database Administrator  
8  Database Admi

In [34]:
# Rich display of the first five rows of the reloaded frame
df_cleaned.head(n=5)

Unnamed: 0,ID,Resume,Category
0,1,Database Administrator- Family Private Care LL...,Database Administrator
1,2,Database Administrator Database Administrator ...,Database Administrator
2,3,Oracle Database Administrator Oracle Database ...,Database Administrator
3,4,Amazon Redshift Administrator and ETL Develope...,Database Administrator
4,5,Scrum Master Scrum Master Scrum Master Richmon...,Database Administrator


In [36]:
# Number of missing values per column
df_cleaned.isna().sum()

ID          0
Resume      0
Category    0
dtype: int64

In [38]:
import pandas as pd

# Start again from the cleaned UTF-8 file written earlier
df_cleaned = pd.read_csv(filepath_or_buffer='cleaned_resumes_utf8.csv')

In [40]:
# Remove the ID column.
# errors='ignore' keeps this cell re-runnable: because it reassigns
# df_cleaned from itself, a second run would otherwise raise KeyError
# once 'ID' has already been dropped.
df_cleaned = df_cleaned.drop(columns=['ID'], errors='ignore')

In [42]:
# Keep exactly the two working columns, in this order
df_cleaned = df_cleaned.loc[:, ['Resume', 'Category']]

In [44]:
# dtypes pandas inferred for the two remaining columns
print("Data types:\n", df_cleaned.dtypes)

# Python-level value types per column; a single class per column
# means no mixed types remain
for col in df_cleaned.columns:
    unique_types = df_cleaned[col].map(type).unique()
    print(f"{col}: {unique_types}")

# First ten rows
print("Sample data:\n", df_cleaned.head(n=10))

Data types:
 Resume      object
Category    object
dtype: object
Resume: [<class 'str'>]
Category: [<class 'str'>]
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Administrator


In [46]:
# Ensure Resume is string.
# Missing values are filled before the cast rather than relying on NaN being
# stringified to 'nan' and replaced afterwards: that stringification is
# version-dependent, and the replace would also clobber text that is
# genuinely 'nan'.
df_cleaned['Resume'] = df_cleaned['Resume'].fillna('Unknown').astype(str)

# Ensure Category is categorical, with missing labels marked 'Unknown'
df_cleaned['Category'] = df_cleaned['Category'].fillna('Unknown').astype(str).astype('category')

In [48]:
# Write the two-column frame out as UTF-8, without the row index
df_cleaned.to_csv('resumes_final_utf8.csv', encoding='utf-8', index=False)

In [50]:
# Rich display of the first five rows after dropping ID
df_cleaned.head(n=5)

Unnamed: 0,Resume,Category
0,Database Administrator- Family Private Care LL...,Database Administrator
1,Database Administrator Database Administrator ...,Database Administrator
2,Oracle Database Administrator Oracle Database ...,Database Administrator
3,Amazon Redshift Administrator and ETL Develope...,Database Administrator
4,Scrum Master Scrum Master Scrum Master Richmon...,Database Administrator


In [52]:
import pandas as pd

# Read the two-column resume file produced above
df_final = pd.read_csv(filepath_or_buffer='resumes_final_utf8.csv')

In [54]:
# Resume: real missing values vs. the literal 'Unknown' placeholder
resume_nan = df_final['Resume'].isnull().sum()
resume_unknown = df_final['Resume'].eq('Unknown').sum()
resume_total = resume_nan + resume_unknown

# Category: same two tallies
category_nan = df_final['Category'].isnull().sum()
category_unknown = df_final['Category'].eq('Unknown').sum()
category_total = category_nan + category_unknown

# Report per column
print("Resume column:")
print(f"  NaN count: {resume_nan}")
print(f"  'Unknown' count: {resume_unknown}")
print(f"  Total NaN or 'Unknown': {resume_total}")
print("Category column:")
print(f"  NaN count: {category_nan}")
print(f"  'Unknown' count: {category_unknown}")
print(f"  Total NaN or 'Unknown': {category_total}")

# Overall size of the frame, for comparison with the counts above
total_rows = len(df_final)
print(f"Total rows in dataset: {total_rows}")

Resume column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Category column:
  NaN count: 0
  'Unknown' count: 32
  Total NaN or 'Unknown': 32
Total rows in dataset: 21671


In [1]:
import pandas as pd

# Read the two-column resume file again for the filtering step
df_final = pd.read_csv(filepath_or_buffer='resumes_final_utf8.csv')

In [3]:
# Keep only the rows whose Category is something other than the 'Unknown' placeholder
df_final = df_final[df_final['Category'].ne('Unknown')]

In [5]:
# How many rows survived the filter
print(f"Total rows after removing 'Unknown': {len(df_final)}")

# No 'Unknown' labels should be left in Category
category_unknown = df_final['Category'].eq('Unknown').sum()
print(f"'Unknown' count in Category: {category_unknown}")

# Column dtypes
print("Data types:\n", df_final.dtypes)

# First ten rows
print("Sample data:\n", df_final.head(n=10))

Total rows after removing 'Unknown': 21639
'Unknown' count in Category: 0
Data types:
 Resume      object
Category    object
dtype: object
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Administrator


In [7]:
# Write the filtered frame out as UTF-8, without the row index
df_final.to_csv('resumes_final_cleaned_utf8.csv', encoding='utf-8', index=False)

In [9]:
# Read the file just written and print its columns, row count,
# dtypes and first ten rows as a round-trip check
df_verified = pd.read_csv(filepath_or_buffer='resumes_final_cleaned_utf8.csv')
print("Columns:", df_verified.columns)
print("Total rows:", df_verified.shape[0])
print("Data types:\n", df_verified.dtypes)
print("Sample data:\n", df_verified.head(n=10))

Columns: Index(['Resume', 'Category'], dtype='object')
Total rows: 21639
Data types:
 Resume      object
Category    object
dtype: object
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Administrator


In [15]:
# Show the column labels of df_final
print(
    "Columns:",
    df_final.columns,
)

Columns: Index(['Resume', 'Category'], dtype='object')


In [11]:
import pandas as pd

# Read the second resume source
df = pd.read_csv(filepath_or_buffer='Cleaned_Book2.csv')

In [13]:
# Show the column labels of the frame just loaded
print(
    "Columns:",
    df.columns,
)

Columns: Index(['Resume', 'Category'], dtype='object')


In [17]:
# Resume: real missing values vs. the literal 'Unknown' placeholder
resume_nan = df['Resume'].isnull().sum()
resume_unknown = df['Resume'].eq('Unknown').sum()
resume_total = resume_nan + resume_unknown

# Category: same two tallies
category_nan = df['Category'].isnull().sum()
category_unknown = df['Category'].eq('Unknown').sum()
category_total = category_nan + category_unknown

# Per-column row masks: True where the value is NaN or 'Unknown'
resume_missing = df['Resume'].isna() | (df['Resume'] == 'Unknown')
category_missing = df['Category'].isna() | (df['Category'] == 'Unknown')

# Rows flagged in both columns, and rows flagged in at least one
both_condition = resume_missing & category_missing
both_count = both_condition.sum()

either_condition = resume_missing | category_missing
either_count = either_condition.sum()

# Report
print("Resume column:")
print(f"  NaN count: {resume_nan}")
print(f"  'Unknown' count: {resume_unknown}")
print(f"  Total NaN or 'Unknown': {resume_total}")
print("Category column:")
print(f"  NaN count: {category_nan}")
print(f"  'Unknown' count: {category_unknown}")
print(f"  Total NaN or 'Unknown': {category_total}")
print(f"Total rows in dataset: {len(df)}")
print(f"Rows where both Resume and Category are NaN or 'Unknown': {both_count}")
print(f"Rows where either Resume or Category is NaN or 'Unknown': {either_count}")

Resume column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Category column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Total rows in dataset: 1001
Rows where both Resume and Category are NaN or 'Unknown': 0
Rows where either Resume or Category is NaN or 'Unknown': 0


In [19]:
import pandas as pd

# Read the two inputs that will be appended together.
# If Cleaned_Book2.csv fails to decode, pass encoding='latin1' to its read.
df_book2 = pd.read_csv('Cleaned_Book2.csv')
df_final = pd.read_csv('resumes_final_cleaned_utf8.csv')

In [21]:
# Print both column sets so they can be compared before concatenating
print(
    "resumes_final_cleaned_utf8.csv columns:",
    df_final.columns,
)
print(
    "Cleaned_Book2.csv columns:",
    df_book2.columns,
)

resumes_final_cleaned_utf8.csv columns: Index(['Resume', 'Category'], dtype='object')
Cleaned_Book2.csv columns: Index(['Resume', 'Category'], dtype='object')


In [23]:
# Stack df_book2 underneath df_final and renumber the rows 0..n-1
frames_to_stack = [df_final, df_book2]
df_combined = pd.concat(frames_to_stack, ignore_index=True)

In [25]:
# Resume: real missing values vs. the literal 'Unknown' placeholder
resume_nan = df_combined['Resume'].isnull().sum()
resume_unknown = df_combined['Resume'].eq('Unknown').sum()
resume_total = resume_nan + resume_unknown

# Category: same two tallies
category_nan = df_combined['Category'].isnull().sum()
category_unknown = df_combined['Category'].eq('Unknown').sum()
category_total = category_nan + category_unknown

# Per-column row masks: True where the value is NaN or 'Unknown'
resume_missing = df_combined['Resume'].isna() | (df_combined['Resume'] == 'Unknown')
category_missing = df_combined['Category'].isna() | (df_combined['Category'] == 'Unknown')

# Rows flagged in both columns, and rows flagged in at least one
both_condition = resume_missing & category_missing
both_count = both_condition.sum()

either_condition = resume_missing | category_missing
either_count = either_condition.sum()

# Report
print("Combined dataset:")
print("Resume column:")
print(f"  NaN count: {resume_nan}")
print(f"  'Unknown' count: {resume_unknown}")
print(f"  Total NaN or 'Unknown': {resume_total}")
print("Category column:")
print(f"  NaN count: {category_nan}")
print(f"  'Unknown' count: {category_unknown}")
print(f"  Total NaN or 'Unknown': {category_total}")
print(f"Total rows in combined dataset: {len(df_combined)}")
print(f"Rows where both Resume and Category are NaN or 'Unknown': {both_count}")
print(f"Rows where either Resume or Category is NaN or 'Unknown': {either_count}")

Combined dataset:
Resume column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Category column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Total rows in combined dataset: 22640
Rows where both Resume and Category are NaN or 'Unknown': 0
Rows where either Resume or Category is NaN or 'Unknown': 0


In [27]:
# Ensure consistent data types across the concatenated rows.
# Missing values are filled before the string cast rather than relying on NaN
# being stringified to 'nan' and replaced afterwards: that stringification is
# version-dependent, and the replace would also clobber text that is
# genuinely 'nan'.
df_combined['Resume'] = df_combined['Resume'].fillna('Unknown').astype(str)
df_combined['Category'] = df_combined['Category'].fillna('Unknown').astype(str).astype('category')

In [29]:
# Summarise the combined frame after the type pass: size, dtypes, first ten rows
print("After cleaning:")
print(f"Total rows: {df_combined.shape[0]}")
print("Data types:\n", df_combined.dtypes)
print("Sample data:\n", df_combined.head(n=10))

After cleaning:
Total rows: 22640
Data types:
 Resume        object
Category    category
dtype: object
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Administrator


In [31]:
# Write the combined frame out as UTF-8, without the row index
df_combined.to_csv('resumes_combined_utf8.csv', encoding='utf-8', index=False)

In [33]:
# Read the file just written and print its columns, row count,
# dtypes and first ten rows as a round-trip check
df_verified = pd.read_csv(filepath_or_buffer='resumes_combined_utf8.csv')
print("Saved file:")
print("Columns:", df_verified.columns)
print("Total rows:", df_verified.shape[0])
print("Data types:\n", df_verified.dtypes)
print("Sample data:\n", df_verified.head(n=10))

Saved file:
Columns: Index(['Resume', 'Category'], dtype='object')
Total rows: 22640
Data types:
 Resume      object
Category    object
dtype: object
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Adm

In [35]:
import pandas as pd

# Read the two inputs that will be appended together.
# If Cleaned_Dataset_TalentMatch.csv fails to decode, pass encoding='latin1' to its read.
df_talentmatch = pd.read_csv('Cleaned_Dataset_TalentMatch.csv')
df_combined = pd.read_csv('resumes_combined_utf8.csv')

In [37]:
# Print both column sets so they can be compared before concatenating
print(
    "resumes_combined_utf8.csv columns:",
    df_combined.columns,
)
print(
    "Cleaned_Dataset_TalentMatch.csv columns:",
    df_talentmatch.columns,
)

resumes_combined_utf8.csv columns: Index(['Resume', 'Category'], dtype='object')
Cleaned_Dataset_TalentMatch.csv columns: Index(['Resume', 'Category'], dtype='object')


In [39]:
# Stack df_talentmatch underneath df_combined and renumber the rows 0..n-1.
# NOTE: df_combined is rebuilt from itself here, so running this cell twice
# without reloading df_combined appends df_talentmatch a second time.
frames_to_stack = [df_combined, df_talentmatch]
df_combined = pd.concat(frames_to_stack, ignore_index=True)

In [41]:
# Resume: real missing values vs. the literal 'Unknown' placeholder
resume_nan = df_combined['Resume'].isnull().sum()
resume_unknown = df_combined['Resume'].eq('Unknown').sum()
resume_total = resume_nan + resume_unknown

# Category: same two tallies
category_nan = df_combined['Category'].isnull().sum()
category_unknown = df_combined['Category'].eq('Unknown').sum()
category_total = category_nan + category_unknown

# Per-column row masks: True where the value is NaN or 'Unknown'
resume_missing = df_combined['Resume'].isna() | (df_combined['Resume'] == 'Unknown')
category_missing = df_combined['Category'].isna() | (df_combined['Category'] == 'Unknown')

# Rows flagged in both columns, and rows flagged in at least one
both_condition = resume_missing & category_missing
both_count = both_condition.sum()

either_condition = resume_missing | category_missing
either_count = either_condition.sum()

# Report
print("Combined dataset:")
print("Resume column:")
print(f"  NaN count: {resume_nan}")
print(f"  'Unknown' count: {resume_unknown}")
print(f"  Total NaN or 'Unknown': {resume_total}")
print("Category column:")
print(f"  NaN count: {category_nan}")
print(f"  'Unknown' count: {category_unknown}")
print(f"  Total NaN or 'Unknown': {category_total}")
print(f"Total rows in combined dataset: {len(df_combined)}")
print(f"Rows where both Resume and Category are NaN or 'Unknown': {both_count}")
print(f"Rows where either Resume or Category is NaN or 'Unknown': {either_count}")

Combined dataset:
Resume column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Category column:
  NaN count: 0
  'Unknown' count: 0
  Total NaN or 'Unknown': 0
Total rows in combined dataset: 23640
Rows where both Resume and Category are NaN or 'Unknown': 0
Rows where either Resume or Category is NaN or 'Unknown': 0


In [43]:
# Ensure consistent data types across the concatenated rows.
# Missing values are filled before the string cast rather than relying on NaN
# being stringified to 'nan' and replaced afterwards: that stringification is
# version-dependent, and the replace would also clobber text that is
# genuinely 'nan'.
df_combined['Resume'] = df_combined['Resume'].fillna('Unknown').astype(str)
df_combined['Category'] = df_combined['Category'].fillna('Unknown').astype(str).astype('category')

In [45]:
# Summarise the combined frame after the type pass: size, dtypes, first ten rows
print("\nAfter cleaning:")
print(f"Total rows: {df_combined.shape[0]}")
print("Data types:\n", df_combined.dtypes)
print("Sample data:\n", df_combined.head(n=10))


After cleaning:
Total rows: 23640
Data types:
 Resume        object
Category    category
dtype: object
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Administrator


In [47]:
# Write the enlarged combined frame out as UTF-8, without the row index
df_combined.to_csv('resumes_combined_updated_utf8.csv', encoding='utf-8', index=False)

In [49]:
# Read the file just written and print its columns, row count,
# dtypes and first ten rows as a round-trip check
df_verified = pd.read_csv(filepath_or_buffer='resumes_combined_updated_utf8.csv')
print("\nSaved file:")
print("Columns:", df_verified.columns)
print("Total rows:", df_verified.shape[0])
print("Data types:\n", df_verified.dtypes)
print("Sample data:\n", df_verified.head(n=10))


Saved file:
Columns: Index(['Resume', 'Category'], dtype='object')
Total rows: 23640
Data types:
 Resume      object
Category    object
dtype: object
Sample data:
                                               Resume                Category
0  Database Administrator- Family Private Care LL...  Database Administrator
1  Database Administrator Database Administrator ...  Database Administrator
2  Oracle Database Administrator Oracle Database ...  Database Administrator
3  Amazon Redshift Administrator and ETL Develope...  Database Administrator
4  Scrum Master Scrum Master Scrum Master Richmon...  Database Administrator
5  Oracle Database Administrator Oracle Database ...  Database Administrator
6  Oracle Database Administrator Oracle Database ...  Database Administrator
7  Lead Database Administrator/Developer Lead Dat...  Database Administrator
8  Database Administrator / Database Developer Da...  Database Administrator
9  Oracle Database Administrator Oracle Database ...  Database Ad