In [41]:
import pandas as pd
# Load the dataset
file_path = 'training.1600000.processed.noemoticon.csv'
column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
try:
    df = pd.read_csv(file_path, header=None, encoding='latin1', names=column_names)
except FileNotFoundError:
    print("Dataset file not found.")
    exit()
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()


  df = pd.read_csv(file_path, header=None, encoding='latin1', names=column_names)


In [42]:
# حذف ردیف با مقدار اشتباه
df = df[df['sentiment'].isin(['0', '4'])]

# تبدیل مقادیر رشته‌ای '0' و '4' به عدد صحیح 0 و 4
df['sentiment'] = df['sentiment'].replace('0', 0)
df['sentiment'] = df['sentiment'].replace('4', 4)

# تبدیل کل ستون 'sentiment' به نوع داده integer
df['sentiment'] = pd.to_numeric(df['sentiment'], errors='raise')

  df['sentiment'] = df['sentiment'].replace('0', 0)


In [43]:
# Convert text to lowercase
df['text_lower'] = df['text'].str.lower()
print("Sample DataFrame after converting to lowercase:")
print(df[['text', 'text_lower']].head())

Sample DataFrame after converting to lowercase:
                                                text  \
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   
5                      @Kwesidei not the whole crew    

                                          text_lower  
1  is upset that he can't update his facebook by ...  
2  @kenichan i dived many times for the ball. man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  
5                      @kwesidei not the whole crew   


In [44]:
import string
# Remove punctuation
punctuation = string.punctuation

def remove_punctuation(text):
    translator = str.maketrans('', '', punctuation)
    return text.translate(translator)

In [45]:
df['text_without_punctuation'] = df['text_lower'].apply(remove_punctuation)
print("\nSample DataFrame after removing punctuation:")
print(df[['text_lower', 'text_without_punctuation']].head())


Sample DataFrame after removing punctuation:
                                          text_lower  \
1  is upset that he can't update his facebook by ...   
2  @kenichan i dived many times for the ball. man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   
5                      @kwesidei not the whole crew    

                            text_without_punctuation  
1  is upset that he cant update his facebook by t...  
2  kenichan i dived many times for the ball manag...  
3    my whole body feels itchy and like its on fire   
4  nationwideclass no its not behaving at all im ...  
5                       kwesidei not the whole crew   


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ایجاد یک شیء از TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# آموزش و تبدیل متن پیش‌پردازش شده به ماتریس ویژگی TF-IDF **روی متن بدون علائم نگارشی**
X = tfidf_vectorizer.fit_transform(df['text_without_punctuation'])

# حالا X شامل ماتریس ویژگی TF-IDF برای توییت‌های شماست
print("شکل ماتریس ویژگی TF-IDF:", X.shape)
print("تعداد ویژگی‌ها (اندازه واژگان):", len(tfidf_vectorizer.vocabulary_))
print("نمونه‌ای از ویژگی‌های تبدیل شده (به صورت sparse matrix):")
print(X[:5])

# برای اینکه بفهمیم هر ستون در ماتریس X مربوط به چه کلمه‌ای است:
feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nنمونه‌ای از نام ویژگی‌ها (کلمات واژگان):", feature_names[:20])

شکل ماتریس ویژگی TF-IDF: (131071, 114067)
تعداد ویژگی‌ها (اندازه واژگان): 114067
نمونه‌ای از ویژگی‌های تبدیل شده (به صورت sparse matrix):
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 69 stored elements and shape (5, 114067)>
  Coords	Values
  (0, 50439)	0.11036684136121382
  (0, 106172)	0.2621207295151251
  (0, 99587)	0.13470245050605564
  (0, 41545)	0.19564633891370647
  (0, 17037)	0.15064788544230587
  (0, 106053)	0.2734398809456602
  (0, 42812)	0.21405361227602984
  (0, 32914)	0.2731692907813496
  (0, 16361)	0.1971530196585546
  (0, 99439)	0.3420433926326614
  (0, 50622)	0.11697862421483782
  (0, 6478)	0.10538030093613089
  (0, 65804)	0.23765702595734384
  (0, 23311)	0.2493533943177669
  (0, 8317)	0.18907116088369239
  (0, 84685)	0.33399152864890724
  (0, 88287)	0.18697104494029865
  (0, 101957)	0.15508896905381747
  (0, 5743)	0.23517928732437446
  (0, 12968)	0.28958038212268006
  (1, 55564)	0.40104315066541396
  (1, 27460)	0.40104315066541396
  (1, 63084)	0.2097548

In [48]:
from sklearn.model_selection import train_test_split

# فرض کنید X ماتریس ویژگی TF-IDF و y ستون احساسات (sentiment) باشه
# اگر ستون احساسات شما اسم دیگه‌ای داره، اون رو جایگزین کنید
y = df['sentiment']

# تقسیم داده‌ها به آموزش و تست
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("شکل X_train:", X_train.shape)
print("شکل X_test:", X_test.shape)
print("شکل y_train:", y_train.shape)
print("شکل y_test:", y_test.shape)

شکل X_train: (104856, 114067)
شکل X_test: (26215, 114067)
شکل y_train: (104856,)
شکل y_test: (26215,)


In [50]:
from sklearn.naive_bayes import MultinomialNB

# ایجاد یک شیء از مدل MultinomialNB
naive_bayes_model = MultinomialNB()

# آموزش مدل با استفاده از داده‌های آموزشی
naive_bayes_model.fit(X_train, y_train)

print("مدل Naive Bayes با موفقیت آموزش داده شد.")

مدل Naive Bayes با موفقیت آموزش داده شد.
