In [None]:
# Install NLTK into the kernel's environment; the preprocessing cell below imports it.
%pip install nltk


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer

# Download NLTK resources (if not already downloaded)
# Fetches the 'punkt' tokenizer data.
# NOTE(review): the cells visible here tokenize with str.split() and never call
# an NLTK tokenizer, so this download may be unnecessary — confirm before removing.
nltk.download('punkt')

# Custom list of Urdu stop words, used to filter tokens out of each story.
# Every entry is unique and listed in order of first appearance; repeated
# entries would add nothing to a membership test and only make the list
# harder to maintain.
# NOTE(review): 'ہوشربا' looks like a content word rather than a function
# word — confirm it is meant to be filtered.
stop_words_urdu = [
    'ہے', 'کے', 'کا', 'کی', 'ہوتا', 'کو', 'سے', 'پر', 'ہیں',
    'کر', 'لئے', 'سب', 'ہوشربا', 'یہ', 'اور', 'یا', 'تک',
    'میں', 'بھی', 'آپ', 'ہی', 'کوئی', 'رہے', 'تھا', 'لیا',
    'لیکن', 'کیا',
]

# Assuming your CSV file is named 'stories.csv'
file_path = 'stories.csv'


def _split_stories(cell):
    """Break one CSV cell into its comma-separated stories.

    A missing cell (NaN) yields an empty list, which `explode` later turns
    back into a single NaN row.
    """
    if pd.notna(cell):
        return cell.split(',')
    return []


# The file is read without a header row and the column is named 'Story'.
# NOTE(review): with a single name supplied, pandas treats any extra leading
# fields on a row as index levels, so commas acting as CSV delimiters would
# never reach the split below — confirm each line is stored as one field.
df = pd.read_csv(file_path, header=None, names=['Story'])

# One row per story: split every cell on commas, then flatten the lists.
df = df.assign(Story=df['Story'].apply(_split_stories)).explode('Story')

# Trim leading and trailing whitespace from each story.
df = df.assign(Story=df['Story'].str.strip())

# Remove stop words, handling NaN values.
# Two vocabularies are filtered in a single pass: the general Urdu stop words
# in `stop_words_urdu` and the custom list below. Filtering against their union
# gives the same text as filtering against each list in turn.

# Custom list of pronouns and other words to exclude (each entry listed once)
custom_stop_words = ['اچھا', 'بہت', 'یہ', 'تم', 'میں', 'کون', 'کیا', 'آپ', 'ہم', 'وہ']

# A set gives constant-time membership tests instead of scanning a list per token.
_excluded_words = set(stop_words_urdu) | set(custom_stop_words)


def _drop_excluded_words(text):
    """Return `text` with every excluded word removed.

    Tokens are split on whitespace and re-joined with single spaces.
    Non-string values (such as NaN) are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(token for token in text.split() if token not in _excluded_words)


df['Story'] = df['Story'].apply(_drop_excluded_words)

# Stemming
# NOTE(review): the Porter algorithm is designed around English suffixes, so
# Urdu-script tokens presumably pass through unchanged — confirm this step has
# the intended effect on the corpus.
stemmer = PorterStemmer()


def _stem_story(text):
    """Stem each whitespace-separated token of `text`; non-strings pass through."""
    if isinstance(text, str):
        stemmed_tokens = map(stemmer.stem, text.split())
        return ' '.join(stemmed_tokens)
    return text


df['Story'] = df['Story'].apply(_stem_story)

# Split into training and testing sets
# Rows whose story is missing (NaN) or blank after cleaning carry no text, so
# they are left out of the split instead of being written as empty examples.
# `df` itself is not modified. The mask is applied positionally because the
# index holds repeated labels after `explode`.
has_text = df['Story'].apply(
    lambda story: isinstance(story, str) and story.strip() != ''
).astype(bool)
stories_to_split = df.loc[has_text.to_numpy()]

# Fixed random_state keeps the 80/20 split reproducible between runs.
train_df, test_df = train_test_split(stories_to_split, test_size=0.2, random_state=42)

# Display the cleaned and stemmed DataFrame
print(train_df.head())

# Save the cleaned and stemmed training DataFrame to a new CSV file
train_file_path = 'train_stories.csv'
train_df.to_csv(train_file_path, index=False)

# Display the cleaned and stemmed testing DataFrame
print(test_df.head())

# Save the cleaned and stemmed testing DataFrame to a new CSV file
test_file_path = 'test_stories.csv'
test_df.to_csv(test_file_path, index=False)


In [5]:
%pip install --user transformers 


Collecting transformers
  Using cached transformers-4.35.2-py3-none-any.whl.metadata (123 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Using cached huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Using cached tokenizers-0.15.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.16.4->transformers)
  Using cached fsspec-2023.12.0-py3-none-any.whl.metadata (6.8 kB)
Collecting packaging>=20.0 (from transformers)
  Using cached packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Using cached transformers-4.35.2-py3-none-any.whl (7.9 MB)
Using cached huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
Using cached packaging-23.2-py3-none-any.whl (53 kB)
Using cached tokenizers-0.15.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
Using cached fsspec-2023.12.0-py3-none-any.whl (168 kB)
Installing collected package

In [1]:
# Install PyTorch into the kernel's environment; the model-loading cell below imports `torch`.
%pip install torch

Collecting torch
  Downloading torch-2.1.1-cp38-cp38-manylinux1_x86_64.whl.metadata (25 kB)
Collecting sympy (from torch)
  Downloading sympy-1.12-py3-none-any.whl (5.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.7/5.7 MB[0m [31m761.7 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting networkx (from torch)
  Downloading networkx-3.1-py3-none-any.whl (2.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:02[0mm
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvi

In [3]:
from transformers import BertModel, BertTokenizer
import torch

# Load pre-trained BERT model and tokenizer
# Both objects are created from the same checkpoint name.
# NOTE(review): this is the uncased multilingual checkpoint; the upstream BERT
# multilingual notes recommend the cased variant for most languages — confirm
# the choice for Urdu text.
model_name = 'bert-base-multilingual-uncased'  # You can replace this with the specific model name you want to use
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)


ModuleNotFoundError: No module named 'transformers'