In [53]:
import pandas as pd
# Load the dataset
file_path = 'training.1600000.processed.noemoticon.csv'
column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
try:
    df = pd.read_csv(file_path, header=None, encoding='latin1', names=column_names)
except FileNotFoundError:
    print("Dataset file not found.")
    exit()
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()




  df = pd.read_csv(file_path, header=None, encoding='latin1', names=column_names)


In [54]:
# Convert text to lowercase
df['text_lower'] = df['text'].str.lower()
print("Sample DataFrame after converting to lowercase:")
print(df[['text', 'text_lower']].head())

Sample DataFrame after converting to lowercase:
                                                text  \
0                                 text of the tweet    
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                          text_lower  
0                                 text of the tweet   
1  is upset that he can't update his facebook by ...  
2  @kenichan i dived many times for the ball. man...  
3    my whole body feels itchy and like its on fire   
4  @nationwideclass no, it's not behaving at all....  


In [55]:
import string
# Remove punctuation
punctuation = string.punctuation

def remove_punctuation(text):
    translator = str.maketrans('', '', punctuation)
    return text.translate(translator)

In [56]:
df['text_without_punctuation'] = df['text_lower'].apply(remove_punctuation)
print("\nSample DataFrame after removing punctuation:")
print(df[['text_lower', 'text_without_punctuation']].head())


Sample DataFrame after removing punctuation:
                                          text_lower  \
0                                 text of the tweet    
1  is upset that he can't update his facebook by ...   
2  @kenichan i dived many times for the ball. man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                            text_without_punctuation  
0                                 text of the tweet   
1  is upset that he cant update his facebook by t...  
2  kenichan i dived many times for the ball manag...  
3    my whole body feels itchy and like its on fire   
4  nationwideclass no its not behaving at all im ...  


In [57]:
import string
import nltk
from nltk.corpus import stopwords
# Download stopwords if not already downloaded
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

In [58]:
# Get the set of English stop words
stop_words = set(stopwords.words('english'))
print(f"تعداد stop words انگلیسی: {len(stop_words)}")
print(f"نمونه‌ای از stop words: {list(stop_words)[:10]}")

تعداد stop words انگلیسی: 198
نمونه‌ای از stop words: ["wasn't", 'out', 'which', 'he', 'wouldn', 'ma', "you'll", "i'd", 'ain', 'between']


In [59]:
# Function to remove stop words from text (using a for loop)
def remove_stopwords_loop(text):
    words = [word.lower() for word in text.split()] 
    filtered_words = []
    for word in words:
        if word not in stop_words:
            filtered_words.append(word)
    return " ".join(filtered_words)

In [60]:
df['text_without_stopwords'] = df['text_without_punctuation'].apply(remove_stopwords_loop)
print(df['text_without_stopwords'].head())

0                                           text tweet
1    upset cant update facebook texting might cry r...
2    kenichan dived many times ball managed save 50...
3                     whole body feels itchy like fire
4             nationwideclass behaving im mad cant see
Name: text_without_stopwords, dtype: object


In [61]:
df.to_csv('your_processed_data.csv', index=False)

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ایجاد یک شیء از TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# آموزش و تبدیل متن پیش‌پردازش شده به ماتریس ویژگی TF-IDF
X = tfidf_vectorizer.fit_transform(df['text_without_stopwords'])

# حالا X شامل ماتریس ویژگی TF-IDF برای توییت‌های شماست
print("شکل ماتریس ویژگی TF-IDF:", X.shape)
print("تعداد ویژگی‌ها (اندازه واژگان):", len(tfidf_vectorizer.vocabulary_))
print("نمونه‌ای از ویژگی‌های تبدیل شده (به صورت sparse matrix):")
print(X[:5])

# برای اینکه بفهمیم هر ستون در ماتریس X مربوط به چه کلمه‌ای است:
feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nنمونه‌ای از نام ویژگی‌ها (کلمات واژگان):", feature_names[:20])

شکل ماتریس ویژگی TF-IDF: (1048573, 591035)
تعداد ویژگی‌ها (اندازه واژگان): 591035
نمونه‌ای از ویژگی‌های تبدیل شده (به صورت sparse matrix):
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 37 stored elements and shape (5, 591035)>
  Coords	Values
  (0, 517253)	0.7534146107026727
  (0, 541552)	0.6575457583923267
  (1, 550511)	0.3020468584078402
  (1, 83797)	0.17495340835001688
  (1, 549986)	0.2949660103614437
  (1, 161552)	0.29947321881330696
  (1, 517306)	0.37419561316878136
  (1, 350558)	0.2638174305613934
  (1, 113645)	0.28353584892889
  (1, 445340)	0.37888949897909924
  (1, 463932)	0.22717498271163797
  (1, 530767)	0.1785791673814293
  (1, 27415)	0.2618772750164461
  (1, 64277)	0.34032237888179545
  (2, 295449)	0.4122176774879551
  (2, 134058)	0.45446374473680956
  (2, 335713)	0.20905810569952854
  (2, 527996)	0.21530677127381898
  (2, 50850)	0.27479542712155597
  (2, 334491)	0.2818731460234196
  (2, 462234)	0.2531635544933253
  (2, 8929)	0.2727317711455338
  (2, 445127)

In [63]:
from sklearn.model_selection import train_test_split

# فرض کنید X ماتریس ویژگی TF-IDF و y ستون احساسات (sentiment) باشه
# اگر ستون احساسات شما اسم دیگه‌ای داره، اون رو جایگزین کنید
y = df['sentiment']

# تقسیم داده‌ها به آموزش و تست
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("شکل X_train:", X_train.shape)
print("شکل X_test:", X_test.shape)
print("شکل y_train:", y_train.shape)
print("شکل y_test:", y_test.shape)

شکل X_train: (838858, 591035)
شکل X_test: (209715, 591035)
شکل y_train: (838858,)
شکل y_test: (209715,)


In [65]:
from sklearn.naive_bayes import MultinomialNB

# ایجاد یک شیء از مدل MultinomialNB
naive_bayes_model = MultinomialNB()

# آموزش مدل با استفاده از داده‌های آموزشی
naive_bayes_model.fit(X_train, y_train)

print("مدل Naive Bayes با موفقیت آموزش داده شد.")

TypeError: '<' not supported between instances of 'str' and 'int'