In [2]:
import pandas as pd, numpy as np
import random
import re

In [5]:
# Column -> dtype mapping handed to read_csv: the leading index column is read
# as an integer and every other column as a string.
dtype = {'Unnamed: 0': 'int'}
dtype.update(dict.fromkeys(
    [
        'Title',
        'location',
        'Posting_date',
        'DESCRIPTION',
        'BASIC QUALIFICATIONS',
        'PREFERRED QUALIFICATIONS',
    ],
    'str',
))

# Load the data with the dtypes specified above
amazon_jobs_dataset = pd.read_csv(filepath_or_buffer='amazon_jobs_dataset.csv', dtype=dtype)

# Check the dtypes that were actually imported
print(amazon_jobs_dataset.dtypes)

Unnamed: 0                   int32
Title                       object
location                    object
Posting_date                object
DESCRIPTION                 object
BASIC QUALIFICATIONS        object
PREFERRED QUALIFICATIONS    object
dtype: object


In [6]:
# Drop the leftover 'Unnamed: 0' column by name rather than by position.
# errors='ignore' keeps this cell idempotent: re-running it is a no-op instead
# of silently removing the next column ('Title', then 'location', ...).
amazon_jobs_dataset = amazon_jobs_dataset.drop(columns=['Unnamed: 0'], errors='ignore')
amazon_jobs_dataset.dtypes

Title                       object
location                    object
Posting_date                object
DESCRIPTION                 object
BASIC QUALIFICATIONS        object
PREFERRED QUALIFICATIONS    object
dtype: object

In [7]:
# Cleaning helper: convert every value to str and strip unwanted characters
def nettoyer_donnees(data):
    """Return a cleaned, all-string copy of ``data``.

    Values in text columns (object or string dtype) are converted with
    ``str`` and have the characters ``{``, ``}``, ``'`` and ``"`` removed.
    Values in every other column are converted with ``str`` unless they
    already are strings.

    The input DataFrame is not modified: the cleaning is done on a copy, so
    the caller keeps its original frame.

    Note: missing values are stringified like any other value (for example
    ``NaN`` becomes ``'nan'``), so the result contains no nulls.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and column labels as ``data``.
    """
    caracteres_indesirables = re.compile(r'[{}\'"]')

    # Work on a copy so the caller's DataFrame is left untouched.
    resultat = data.copy()

    for colonne in resultat.columns:
        valeurs = resultat[colonne]
        # Accept dedicated string dtypes as well as plain object columns.
        est_texte = (
            pd.api.types.is_object_dtype(valeurs.dtype)
            or pd.api.types.is_string_dtype(valeurs.dtype)
        )

        # Going through object dtype hands each element to the callback as a
        # plain Python object, whatever the column's storage dtype is.
        elements = valeurs.astype(object)

        if est_texte:
            # Stringify, then remove the special characters
            resultat[colonne] = elements.map(
                lambda x: caracteres_indesirables.sub('', str(x))
            )
        else:
            # Only stringify; values that are already str are kept as they are
            resultat[colonne] = elements.map(
                lambda x: x if isinstance(x, str) else str(x)
            )

    return resultat

# Run the cleaning helper on the loaded job postings.
new_data = nettoyer_donnees(data=amazon_jobs_dataset)

In [8]:
# Show the dtype of each column of the cleaned frame.
new_data.dtypes

Title                       object
location                    object
Posting_date                object
DESCRIPTION                 object
BASIC QUALIFICATIONS        object
PREFERRED QUALIFICATIONS    object
dtype: object

In [9]:

def effacer_contenu(data, proportion, seed=None):
    """Return a copy of ``data`` with random cells of its text columns blanked.

    For every text column (object or string dtype) an independent random
    sample of ``int(proportion * len(data))`` rows is drawn, the column is
    converted to ``str`` and the sampled cells are set to ``None``.  Other
    columns are returned unchanged.

    The input DataFrame is not modified.  Rows are selected by position, so
    the function works whatever the index of ``data`` is.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose text columns should be partially blanked.
    proportion : float
        Fraction of rows, between 0 and 1, to blank in each text column.
    seed : int, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        result is reproducible.  When omitted, the global ``random`` module
        state is used (and can be controlled with ``random.seed``).

    Returns
    -------
    pandas.DataFrame
        New frame with the same shape, index and column labels as ``data``.

    Raises
    ------
    ValueError
        If ``proportion`` is not between 0 and 1.
    """
    if not 0 <= proportion <= 1:
        raise ValueError(
            "proportion must be between 0 and 1, got {!r}".format(proportion)
        )

    generateur = random if seed is None else random.Random(seed)

    # Work on a copy so the caller's DataFrame is left untouched.
    resultat = data.copy()

    # Number of cells to blank in each column, derived from the proportion
    nombre_lignes = len(resultat)
    nombre_a_vider = int(proportion * nombre_lignes)

    for colonne in resultat.columns:
        valeurs = resultat[colonne]
        est_texte = (
            pd.api.types.is_object_dtype(valeurs.dtype)
            or pd.api.types.is_string_dtype(valeurs.dtype)
        )
        if not est_texte:
            continue

        # Draw the random row positions to blank for this column
        positions_a_vider = generateur.sample(range(nombre_lignes), nombre_a_vider)

        # Blank the sampled cells.  The positions come from range(len(data)),
        # so positional .iloc is used instead of label-based .loc.
        valeurs = valeurs.astype(str)
        if positions_a_vider:
            valeurs.iloc[positions_a_vider] = None
        resultat[colonne] = valeurs

    return resultat


In [10]:
# Use the helper to randomly blank cell contents
proportion_a_vider = 0.2  # blank 20% of the cells of each text column

# effacer_contenu draws its rows with the global `random` module; seeding it
# here makes a Restart & Run All blank the same cells every time.
random.seed(42)

# Build a new DataFrame with the modified content.  A copy is passed so that
# `new_data` keeps its cleaned values and this cell can safely be re-run.
amazon_dataset_20 = effacer_contenu(new_data.copy(), proportion_a_vider)


In [11]:
# Summarize the blanked frame: per-column non-null counts and dtypes.
amazon_dataset_20.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3493 entries, 0 to 3492
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Title                     2795 non-null   object
 1   location                  2795 non-null   object
 2   Posting_date              2795 non-null   object
 3   DESCRIPTION               2795 non-null   object
 4   BASIC QUALIFICATIONS      2795 non-null   object
 5   PREFERRED QUALIFICATIONS  2795 non-null   object
dtypes: object(6)
memory usage: 163.9+ KB


In [12]:
# Save the blanked DataFrame as a CSV file, without the row index
nom_fichier_csv = "amazon_dataset-20.csv"
amazon_dataset_20.to_csv(path_or_buf=nom_fichier_csv, index=False)


In [13]:
# Prepare the training data: keep only the rows that have no missing value
amazon_train_dataset = amazon_dataset_20.dropna(axis=0, how='any')

In [14]:
# Display the training DataFrame.
amazon_train_dataset

Unnamed: 0,Title,location,Posting_date,DESCRIPTION,BASIC QUALIFICATIONS,PREFERRED QUALIFICATIONS
2,Software Development Engineer,"IN, KA, Bangalore","March 1, 2018",Amazon is driven by being “the world’s most cu...,· Bachelor’s Degree in Computer Science or rel...,· Experience building complex software systems...
4,Software Development Engineer - Amazon Lex,"US, WA, Seattle","March 1, 2018",Have you ever wondered what it takes to build ...,· Bachelors Degree in Computer Science or a re...,· Masters or PhD in Computer Science· Experien...
5,Software Development Engineer - Amazon Lex,"US, WA, Seattle","March 1, 2018",Have you ever wondered what it takes to build ...,· Bachelors Degree in Computer Science or a re...,· Masters or PhD in Computer Science· Experien...
9,Software Development Manager - Amazon Cloud Cam,"US, CA, Cupertino","March 1, 2018",The Amazon Devices team designs and engineers ...,· Bachelor’s Degree in Computer Science or rel...,· Master’s degree in Computer Science or relat...
15,UI Developer,"US, WA, Seattle","February 28, 2018",How can we build talent pipelines to support A...,· Bachelors Degree in Computer Science or a re...,· Familiarity with R and Shiny web framework· ...
...,...,...,...,...,...,...
3484,Amazon Kindle Software Engineer,"US, WA, Seattle","November 19, 2012","The Amazon Kindle team is coming to Tel Aviv, ...",· BS or higher degree in Computer Science or a...,· Should be very interested in writing lots of...
3486,Software Development Engineer II,"CA, BC, Vancouver","October 21, 2012",Come be part of a team that will shape and inf...,· Bachelor’s Degree in Computer Science or rel...,All applicants must meet the basic qualificati...
3488,Software Engineer,"US, NV, Las Vegas","August 29, 2012",What are you currently doing? We’re building a...,· 5+ years production Java software developmen...,· 7+ years Java software development in a prod...
3489,Software Development Engineer,"US, WA, Seattle","April 23, 2012",At Amazon Voice and Advanced Natural Shopping ...,* Bachelor’s Degree in Computer Science or rel...,* Experience building complex software systems...


In [15]:
# Summarize the training frame: per-column non-null counts and dtypes.
amazon_train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 937 entries, 2 to 3492
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Title                     937 non-null    object
 1   location                  937 non-null    object
 2   Posting_date              937 non-null    object
 3   DESCRIPTION               937 non-null    object
 4   BASIC QUALIFICATIONS      937 non-null    object
 5   PREFERRED QUALIFICATIONS  937 non-null    object
dtypes: object(6)
memory usage: 51.2+ KB


In [16]:
# Save the training DataFrame as a CSV file, without the row index
nom_fichier_csv = "train_dataset.csv"
amazon_train_dataset.to_csv(path_or_buf=nom_fichier_csv, index=False)
