In [None]:
import pandas as pd

# Read the raw dataset from disk (change the path to match your environment)
df = pd.read_csv(filepath_or_buffer='/content/raja.csv')

# Display the loaded frame
df

Unnamed: 0,input,output,genre
0,A SHIFTING REEF\n\nThe year 1866 was signalise...,"The story follows Professor Pierre Aronnax, hi...",Fiction
1,"""ARE you for a walk,"" said Montraville to his ...","Charlotte Temple"" is a cautionary tale about a...",Romance
2,The studio was filled with the rich odour of r...,"""The Picture of Dorian Gray"" is Oscar Wilde's ...",Horror
3,Prologue\n\n\nIN WHICH THE AUTHOR OF THIS SING...,"""The Phantom of the Opera"" is a Gothic novel f...",Horror
4,"PART I\n\n\nAt the little town of Vevey, in Sw...","Certainly. Here's a summary of ""Daisy Miller"" ...",Fiction
5,"The story had held us, round the fire, suffici...",The story begins with a framing device: guests...,Thriller
6,"THE EARLY MARRIED LIFE OF THE MORELS\n\n""THE B...","Walter Morel is a coal miner, while Gertrude c...",Thriller
7,"The Nellie, a cruising yawl, swung to her anch...",Framing narrative:\nThe story begins on a boat...,Fiction
8,I\n\n\nOn my right hand there were lines of fi...,"""The Secret Sharer"" is a short story by Joseph...",Fiction
9,I.--The Adventure of the Empty House.\n\n\n\nI...,The Empty House\n\nSherlock Holmes returns fro...,Fiction


In [None]:
# Text columns that are normalised to lower case
columns_to_lowercase = ['input', 'output', 'genre']

# Lower-case each listed column through pandas' vectorised string accessor
for column in columns_to_lowercase:
    df.loc[:, column] = df.loc[:, column].str.lower()

# Display the frame after lower-casing
df

Unnamed: 0,input,output,genre
0,a shifting reef\n\nthe year 1866 was signalise...,"the story follows professor pierre aronnax, hi...",fiction
1,"""are you for a walk,"" said montraville to his ...","charlotte temple"" is a cautionary tale about a...",romance
2,the studio was filled with the rich odour of r...,"""the picture of dorian gray"" is oscar wilde's ...",horror
3,prologue\n\n\nin which the author of this sing...,"""the phantom of the opera"" is a gothic novel f...",horror
4,"part i\n\n\nat the little town of vevey, in sw...","certainly. here's a summary of ""daisy miller"" ...",fiction
5,"the story had held us, round the fire, suffici...",the story begins with a framing device: guests...,thriller
6,"the early married life of the morels\n\n""the b...","walter morel is a coal miner, while gertrude c...",thriller
7,"the nellie, a cruising yawl, swung to her anch...",framing narrative:\nthe story begins on a boat...,fiction
8,i\n\n\non my right hand there were lines of fi...,"""the secret sharer"" is a short story by joseph...",fiction
9,i.--the adventure of the empty house.\n\n\n\ni...,the empty house\n\nsherlock holmes returns fro...,fiction


In [None]:
# Lower-case string cells only; any non-string value is passed through unchanged
for column in columns_to_lowercase:
    df[column] = df[column].map(lambda value: value.lower() if isinstance(value, str) else value)


In [None]:
# Plain-text preview of the first five rows
print(df.head(5))

                                               input  \
0  a shifting reef\n\nthe year 1866 was signalise...   
1  "are you for a walk," said montraville to his ...   
2  the studio was filled with the rich odour of r...   
3  prologue\n\n\nin which the author of this sing...   
4  part i\n\n\nat the little town of vevey, in sw...   

                                              output    genre  
0  the story follows professor pierre aronnax, hi...  fiction  
1  charlotte temple" is a cautionary tale about a...  romance  
2  "the picture of dorian gray" is oscar wilde's ...   horror  
3  "the phantom of the opera" is a gothic novel f...   horror  
4  certainly. here's a summary of "daisy miller" ...  fiction  


In [None]:
import re

# Define a cleaning function
def clean_text(text):
    """Strip punctuation from ``text`` and normalise its whitespace.

    Apostrophes and digit separators are deleted outright, so contractions
    and numbers stay a single token ("here's" -> "heres", "1,000" -> "1000").
    Every other punctuation character, and the underscore, is replaced by a
    space so that words joined only by punctuation are not fused together
    ("i.--the" -> "i the" rather than "ithe").

    Parameters
    ----------
    text : object
        Value to clean. Only ``str`` values are processed.

    Returns
    -------
    object
        The cleaned string with runs of whitespace collapsed to one space and
        leading/trailing whitespace removed; non-string input is returned
        unchanged.
    """
    if not isinstance(text, str):
        return text  # Return original if not a string

    # Delete straight and curly apostrophes so contractions/possessives stay whole.
    text = re.sub("['\u2019]", '', text)
    # Delete '.' or ',' sitting between two digits so numbers stay one token.
    text = re.sub(r'(?<=\d)[.,](?=\d)', '', text)
    # Turn remaining punctuation into a space. '_' is listed explicitly because
    # \w matches it, so [^\w\s] alone would leave underscores in place.
    text = re.sub(r'[^\w\s]|_', ' ', text)
    # Collapse whitespace runs (including newlines) into a single space.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Columns that are passed through clean_text
columns_to_clean = ['input', 'output', 'genre']

# Run the cleaner element-wise over each listed column
for column in columns_to_clean:
    df[column] = df[column].map(clean_text)

# Inspect the first five rows after cleaning
print(df.head(5))


                                               input  \
0  a shifting reef the year 1866 was signalised b...   
1  are you for a walk said montraville to his com...   
2  the studio was filled with the rich odour of r...   
3  prologue in which the author of this singular ...   
4  part i at the little town of vevey in switzerl...   

                                              output    genre  
0  the story follows professor pierre aronnax his...  fiction  
1  charlotte temple is a cautionary tale about a ...  romance  
2  the picture of dorian gray is oscar wildes onl...   horror  
3  the phantom of the opera is a gothic novel fir...   horror  
4  certainly heres a summary of daisy miller by h...  fiction  


In [None]:
import nltk

# Download the tokenizer models that word_tokenize relies on.
# 'punkt' is the resource name used by older NLTK releases; newer releases
# look up 'punkt_tab' instead and raise LookupError when it is missing, so
# both are requested. An unknown resource name only prints a message.
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize

# List of columns to tokenize
columns_to_tokenize = ['input', 'output', 'genre']

# Tokenize each column into a new '<column>_tokens' column.
# Non-string cells produce an empty token list.
for column in columns_to_tokenize:
    df[column + '_tokens'] = df[column].apply(lambda x: word_tokenize(x) if isinstance(x, str) else [])

# Verify the changes
print(df.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


                                               input  \
0  a shifting reef the year 1866 was signalised b...   
1  are you for a walk said montraville to his com...   
2  the studio was filled with the rich odour of r...   
3  prologue in which the author of this singular ...   
4  part i at the little town of vevey in switzerl...   

                                              output    genre  \
0  the story follows professor pierre aronnax his...  fiction   
1  charlotte temple is a cautionary tale about a ...  romance   
2  the picture of dorian gray is oscar wildes onl...   horror   
3  the phantom of the opera is a gothic novel fir...   horror   
4  certainly heres a summary of daisy miller by h...  fiction   

                                        input_tokens  \
0  [a, shifting, reef, the, year, 1866, was, sign...   
1  [are, you, for, a, walk, said, montraville, to...   
2  [the, studio, was, filled, with, the, rich, od...   
3  [prologue, in, which, the, author, of, this, 

In [None]:
import nltk
from nltk.corpus import stopwords

# Fetch the stopword corpus if it is not already available
nltk.download('stopwords')

# English stopwords, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Token-list columns to filter
token_columns = ['input_tokens', 'output_tokens', 'genre_tokens']


def _drop_stopwords(tokens):
    """Return ``tokens`` without stopwords; non-list input yields an empty list."""
    if not isinstance(tokens, list):
        return []
    return [word for word in tokens if word not in stop_words]


# Filter every token column in turn
for column in token_columns:
    df[column] = df[column].apply(_drop_stopwords)

# Inspect the result
print(df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                               input  \
0  a shifting reef the year 1866 was signalised b...   
1  are you for a walk said montraville to his com...   
2  the studio was filled with the rich odour of r...   
3  prologue in which the author of this singular ...   
4  part i at the little town of vevey in switzerl...   

                                              output    genre  \
0  the story follows professor pierre aronnax his...  fiction   
1  charlotte temple is a cautionary tale about a ...  romance   
2  the picture of dorian gray is oscar wildes onl...   horror   
3  the phantom of the opera is a gothic novel fir...   horror   
4  certainly heres a summary of daisy miller by h...  fiction   

                                        input_tokens  \
0  [shifting, reef, year, 1866, signalised, remar...   
1  [walk, said, montraville, companion, arose, ta...   
2  [studio, filled, rich, odour, roses, light, su...   
3  [prologue, author, singular, work, informs, r

In [None]:
from nltk.stem import WordNetLemmatizer
import nltk

# Download the WordNet corpus used by the lemmatizer.
nltk.download('wordnet')
# Some NLTK releases also load the Open Multilingual Wordnet data when WordNet
# is first accessed and raise LookupError if it is absent; requesting it here
# keeps the cell working across versions.
nltk.download('omw-1.4')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# List of token columns to lemmatize
token_columns = ['input_tokens', 'output_tokens', 'genre_tokens']

# Lemmatize tokens in multiple columns; non-list cells become an empty list.
# NOTE(review): lemmatize() is called without a POS tag, so every token is
# treated as a noun. The saved output shows 'us' reduced to 'u' as a result;
# consider POS tagging or skipping very short tokens if that matters downstream.
for column in token_columns:
    df[column] = df[column].apply(lambda x: [lemmatizer.lemmatize(word) for word in x] if isinstance(x, list) else [])

# Verify the changes
print(df.head())


[nltk_data] Downloading package wordnet to /root/nltk_data...


                                               input  \
0  a shifting reef the year 1866 was signalised b...   
1  are you for a walk said montraville to his com...   
2  the studio was filled with the rich odour of r...   
3  prologue in which the author of this singular ...   
4  part i at the little town of vevey in switzerl...   

                                              output    genre  \
0  the story follows professor pierre aronnax his...  fiction   
1  charlotte temple is a cautionary tale about a ...  romance   
2  the picture of dorian gray is oscar wildes onl...   horror   
3  the phantom of the opera is a gothic novel fir...   horror   
4  certainly heres a summary of daisy miller by h...  fiction   

                                        input_tokens  \
0  [shifting, reef, year, 1866, signalised, remar...   
1  [walk, said, montraville, companion, arose, ta...   
2  [studio, filled, rich, odour, rose, light, sum...   
3  [prologue, author, singular, work, informs, r

In [None]:
# Plain-text preview of the first five rows after lemmatization
print(df.head(5))

                                               input  \
0  a shifting reef the year 1866 was signalised b...   
1  are you for a walk said montraville to his com...   
2  the studio was filled with the rich odour of r...   
3  prologue in which the author of this singular ...   
4  part i at the little town of vevey in switzerl...   

                                              output    genre  \
0  the story follows professor pierre aronnax his...  fiction   
1  charlotte temple is a cautionary tale about a ...  romance   
2  the picture of dorian gray is oscar wildes onl...   horror   
3  the phantom of the opera is a gothic novel fir...   horror   
4  certainly heres a summary of daisy miller by h...  fiction   

                                        input_tokens  \
0  [shifting, reef, year, 1866, signalised, remar...   
1  [walk, said, montraville, companion, arose, ta...   
2  [studio, filled, rich, odour, rose, light, sum...   
3  [prologue, author, singular, work, informs, r

In [None]:
# Persist the processed frame to CSV in the working directory, omitting the row index
df.to_csv(path_or_buf='preprocessed_dataset.csv', index=False)

In [None]:
import ast

# Reload the preprocessed dataset from the same relative path it was written to.
# CSV stores each token list as its text representation (e.g. "['a', 'b']"),
# so the token columns are parsed back into real Python lists while loading;
# without this they would come back as plain strings.
df = pd.read_csv(
    'preprocessed_dataset.csv',
    converters={
        name: ast.literal_eval
        for name in ('input_tokens', 'output_tokens', 'genre_tokens')
    },
)
df

Unnamed: 0,input,output,genre,input_tokens,output_tokens,genre_tokens
0,a shifting reef the year 1866 was signalised b...,the story follows professor pierre aronnax his...,fiction,"['shifting', 'reef', 'year', '1866', 'signalis...","['story', 'follows', 'professor', 'pierre', 'a...",['fiction']
1,are you for a walk said montraville to his com...,charlotte temple is a cautionary tale about a ...,romance,"['walk', 'said', 'montraville', 'companion', '...","['charlotte', 'temple', 'cautionary', 'tale', ...",['romance']
2,the studio was filled with the rich odour of r...,the picture of dorian gray is oscar wildes onl...,horror,"['studio', 'filled', 'rich', 'odour', 'rose', ...","['picture', 'dorian', 'gray', 'oscar', 'wilde'...",['horror']
3,prologue in which the author of this singular ...,the phantom of the opera is a gothic novel fir...,horror,"['prologue', 'author', 'singular', 'work', 'in...","['phantom', 'opera', 'gothic', 'novel', 'first...",['horror']
4,part i at the little town of vevey in switzerl...,certainly heres a summary of daisy miller by h...,fiction,"['part', 'little', 'town', 'vevey', 'switzerla...","['certainly', 'here', 'summary', 'daisy', 'mil...",['fiction']
5,the story had held us round the fire sufficien...,the story begins with a framing device guests ...,thriller,"['story', 'held', 'u', 'round', 'fire', 'suffi...","['story', 'begin', 'framing', 'device', 'guest...",['thriller']
6,the early married life of the morels the botto...,walter morel is a coal miner while gertrude co...,thriller,"['early', 'married', 'life', 'morel', 'bottom'...","['walter', 'morel', 'coal', 'miner', 'gertrude...",['thriller']
7,the nellie a cruising yawl swung to her anchor...,framing narrative the story begins on a boat o...,fiction,"['nellie', 'cruising', 'yawl', 'swung', 'ancho...","['framing', 'narrative', 'story', 'begin', 'bo...",['fiction']
8,i on my right hand there were lines of fishing...,the secret sharer is a short story by joseph c...,fiction,"['right', 'hand', 'line', 'fishing', 'stake', ...","['secret', 'sharer', 'short', 'story', 'joseph...",['fiction']
9,ithe adventure of the empty house it was in th...,the empty house sherlock holmes returns from t...,fiction,"['ithe', 'adventure', 'empty', 'house', 'sprin...","['empty', 'house', 'sherlock', 'holmes', 'retu...",['fiction']
