## CHARGEMENT

In [1]:
import pandas as pd

In [2]:
# Read the raw books CSV into `data` and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will only run where this exact folder layout exists.
raw_csv_path = "C:\\Users\\olake\\Desktop\\PROJETS_CAS_ENTREPRISE\\PROJET_1\\data\\donnees_brutes\\books.csv"
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,title,price,rating
0,A Light in the Attic,Â£51.77,3
1,Tipping the Velvet,Â£53.74,1
2,Soumission,Â£50.10,1
3,Sharp Objects,Â£47.82,4
4,Sapiens: A Brief History of Humankind,Â£54.23,5


In [3]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   100 non-null    object
 1   price   100 non-null    object
 2   rating  100 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


## NETTOYAGE

In [4]:
import re
import unicodedata

def clean_text(text):
    """Normalize text into lowercase ASCII words separated by single spaces.

    The text is lowercased, accent marks are stripped, every character that is
    not ``a-z`` or whitespace is deleted, and whitespace runs are collapsed.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (``None``, ``NaN``, numbers, ...) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text, or ``''`` when the input is missing or contains no
        ASCII letters.
    """
    # Missing cells reach ``Series.apply`` as None/NaN; calling ``.lower()`` on
    # them would raise AttributeError, so they are mapped to an empty string.
    if not isinstance(text, str):
        return ''

    # NFD splits an accented letter into its base letter plus a combining mark
    # (category 'Mn'); dropping the marks turns 'é' into 'e' instead of
    # deleting the letter in the next step.
    decomposed = unicodedata.normalize('NFD', text.lower())
    text = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

    # Remove special characters and digits. They are deleted, not replaced by a
    # space, so "rock'n'roll" becomes "rocknroll". No separate UTF-8 round trip
    # is needed: anything it could drop is not in a-z and is removed here.
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace
    return re.sub(r'\s+', ' ', text).strip()

In [5]:
# Store a normalized copy of every title, then preview raw vs cleaned titles.
data["cleaned_title"] = data["title"].map(clean_text)

data.loc[:, ["title", "cleaned_title"]].head()

Unnamed: 0,title,cleaned_title
0,A Light in the Attic,a light in the attic
1,Tipping the Velvet,tipping the velvet
2,Soumission,soumission
3,Sharp Objects,sharp objects
4,Sapiens: A Brief History of Humankind,sapiens a brief history of humankind


In [6]:
# Show the rows whose title is empty after cleaning.
is_empty_title = data["cleaned_title"].str.len() == 0
data[is_empty_title]

Unnamed: 0,title,price,rating,cleaned_title


In [7]:
# Show the rows that still have a title after cleaning.
# NOTE(review): the filtered frame is only displayed, not assigned back to
# `data`, so rows with an empty cleaned title would not be removed — confirm
# whether that is intended.
has_title = data["cleaned_title"].str.len() > 0
data[has_title]

Unnamed: 0,title,price,rating,cleaned_title
0,A Light in the Attic,Â£51.77,3,a light in the attic
1,Tipping the Velvet,Â£53.74,1,tipping the velvet
2,Soumission,Â£50.10,1,soumission
3,Sharp Objects,Â£47.82,4,sharp objects
4,Sapiens: A Brief History of Humankind,Â£54.23,5,sapiens a brief history of humankind
...,...,...,...,...
95,Lumberjanes Vol. 3: A Terrible Plan (Lumberjan...,Â£19.92,2,lumberjanes vol a terrible plan lumberjanes
96,"Layered: Baking, Building, and Styling Spectac...",Â£40.11,1,layered baking building and styling spectacula...
97,Judo: Seven Steps to Black Belt (an Introducto...,Â£53.90,2,judo seven steps to black belt an introductory...
98,Join,Â£35.67,5,join


In [8]:
# Summary statistics of the number of whitespace-separated words per cleaned title.
words_per_title = data["cleaned_title"].str.split().str.len()
words_per_title.describe()

count    100.000000
mean       7.050000
std        5.133323
min        1.000000
25%        3.000000
50%        6.000000
75%        9.000000
max       26.000000
Name: cleaned_title, dtype: float64

## TRANSFORMATION DU RATING EN VARIABLE EXPLOITABLE

In [9]:
# Normalize the rating to a lowercase string with no surrounding whitespace.
rating_text = data["rating"].astype(str)
data["cleaned_rating"] = rating_text.str.lower().str.strip()

In [10]:
# List each distinct (rating, cleaned_rating) pair to check the normalization.
data.loc[:, ["rating", "cleaned_rating"]].drop_duplicates()

Unnamed: 0,rating,cleaned_rating
0,3,3
1,1,1
3,4,4
4,5,5
10,2,2


In [11]:
# Parse the normalized rating strings as numbers; values that cannot be parsed
# become NaN because of errors='coerce'.
rating_as_number = pd.to_numeric(data['cleaned_rating'], errors='coerce')
data['numeric_rating'] = rating_as_number
data.loc[:, ['rating', 'numeric_rating']].head()

Unnamed: 0,rating,numeric_rating
0,3,3
1,1,1
2,1,1
3,4,4
4,5,5


In [12]:
def sentiment_label(rating):
    """Map a numeric rating to a sentiment label.

    Parameters
    ----------
    rating : int or float
        Numeric rating; may be missing (``None``, ``NaN`` or ``pd.NA``).

    Returns
    -------
    str or None
        ``'positive'`` for ratings >= 4, ``'neutral'`` for ratings in [3, 4),
        ``'negative'`` for ratings below 3, and ``None`` when the rating is
        missing.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # rating would fall through to the last branch and be labelled 'negative'.
    if pd.isna(rating):
        return None
    if rating >= 4:
        return 'positive'
    # `>= 3` rather than `== 3`, so a fractional rating such as 3.5 is not
    # labelled 'negative'.
    if rating >= 3:
        return 'neutral'
    return 'negative'

In [13]:
# Label every row from its numeric rating, then preview the rating columns
# next to the resulting sentiment.
data["sentiment"] = data["numeric_rating"].map(sentiment_label)
data.loc[:, ["rating", "numeric_rating", "sentiment"]].head()

Unnamed: 0,rating,numeric_rating,sentiment
0,3,3,neutral
1,1,1,negative
2,1,1,negative
3,4,4,positive
4,5,5,positive


In [14]:
# Count how many rows fall into each sentiment label.
sentiment_counts = data["sentiment"].value_counts()
sentiment_counts

sentiment
negative    41
positive    37
neutral     22
Name: count, dtype: int64

In [36]:
# Write the frame, including every column added above, to CSV without the index.
# NOTE(review): this is an absolute, machine-specific Windows path; the target
# folder must already exist.
cleaned_csv_path = "C:\\Users\\olake\\Desktop\\PROJETS_CAS_ENTREPRISE\\PROJET_1\\data\\donnees_nettoy\\books_review_nettoyees.csv"
data.to_csv(cleaned_csv_path, index=False)