In [1]:
# Standard library
import re
from collections import Counter

# Numerical computation
import numpy as np
import pandas as pd

# Machine Learning & Data Processing
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Deep Learning (Neural Networks)
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Embedding, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Report the versions of the key libraries so a run can be reproduced later.
# The modules are taken from the import block at the top of this cell rather
# than being (re-)imported here.
for label, module in (
    ("TensorFlow Version:", tf),
    ("Keras Version:", keras),
    ("Scikit-Learn Version:", sklearn),
    ("Seaborn Version:", sns),
):
    print(label, module.__version__)


ModuleNotFoundError: No module named 'numpy'

In [2]:
def preprocessed_tokens(text):
    """Split a piece of text into lowercase alphanumeric tokens.

    Every character other than an ASCII letter, a digit or whitespace is
    deleted (not replaced by a space, so ``"it's"`` becomes ``"its"``); the
    remaining text is lower-cased and split on runs of whitespace.

    Args:
        text: The raw text.  Non-string values (for example ``None`` or the
            float ``NaN`` that pandas uses for missing cells) are treated as
            empty text instead of raising ``TypeError``.

    Returns:
        list[str]: The lowercase tokens in their original order; an empty
        list when ``text`` holds no alphanumeric content.
    """
    if not isinstance(text, str):
        return []
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Lower-casing the whole string once is equivalent to lower-casing each
    # token after the split, because only ASCII alphanumerics remain.
    return cleaned.lower().split()

In [4]:
def tokenize_text(data_text,min_frequency=5):
    """Build a token -> integer index vocabulary from a collection of texts.

    Each text is tokenised with ``preprocessed_tokens``; tokens that occur at
    least ``min_frequency`` times across the whole collection are kept and
    numbered 0, 1, 2, ... in order of first appearance.

    Args:
        data_text: Iterable of raw text strings.
        min_frequency: Minimum corpus-wide occurrence count a token needs to
            be included in the vocabulary.

    Returns:
        tuple[dict[str, int], int]: The token -> index mapping and the number
        of tokens in it.

    NOTE(review): indices start at 0, which is also the default fill value of
    Keras ``pad_sequences``; if these indices are zero-padded later, the first
    vocabulary token becomes indistinguishable from padding -- confirm whether
    index 0 should be reserved.
    """
    # Counter tallies every token in a single pass (the previous
    # ``list.count`` per token was quadratic) and, like a dict, preserves
    # first-seen order, so the assigned indices are unchanged.
    token_counts = Counter(
        token for review in data_text for token in preprocessed_tokens(review)
    )
    frequent_tokens = [
        token for token, count in token_counts.items() if count >= min_frequency
    ]
    token_idx = {token: idx for idx, token in enumerate(frequent_tokens)}
    return token_idx, len(token_idx)


In [5]:
def get_max(data):
    """Return the largest whitespace-delimited token count among the texts.

    Args:
        data: Iterable of strings.

    Returns:
        int: The maximum of ``len(txt.split())`` over ``data``, or ``0`` when
        ``data`` is empty (previously an empty input raised ``ValueError``).
    """
    return max((len(txt.split()) for txt in data), default=0)


In [6]:
def create_sequence(data_text,token_idx,max_tokens):
    """Convert raw texts into a padded 2-D array of token indices.

    Each text is tokenised with ``preprocessed_tokens``; tokens missing from
    ``token_idx`` are dropped, the rest are replaced by their index, and the
    resulting sequences are passed to Keras ``pad_sequences`` with
    ``maxlen=max_tokens``.

    Args:
        data_text: Iterable of raw text strings.
        token_idx: Mapping from token to integer index.
        max_tokens: Length every sequence is padded / truncated to.

    Returns:
        numpy.ndarray: The result of ``pad_sequences`` as an array, one row
        per input text.
    """
    # A concrete list (not a lazy ``map``) is required: pad_sequences needs
    # the number of sequences up front.
    review_token_idx = [
        [token_idx[token] for token in preprocessed_tokens(review) if token in token_idx]
        for review in data_text
    ]
    # The result gets its own name; assigning to a variable named like the
    # padding function would shadow it and fail before the call is made.
    padded = pad_sequences(review_token_idx, maxlen=max_tokens)
    return np.array(padded)


In [8]:
def define_model(num_tokens,max_tokens):
    """Build and compile the stacked-GRU binary classifier.

    The network is an embedding layer followed by three GRU layers
    (16 -> 8 -> 4 units) and a single sigmoid output unit, compiled with
    binary cross-entropy loss, the Adam optimiser (learning rate 0.001) and
    the accuracy metric.  The model summary is printed as a side effect.

    Args:
        num_tokens: Used as the embedding layer's ``input_dim``.
        max_tokens: Used as the embedding layer's ``input_length``.

    Returns:
        The compiled Keras ``Sequential`` model.

    Note:
        The embedding width is read from the module-level constant
        ``EMBEDDING_SIZE``, which must be defined before this is called.
    """
    model = Sequential()
    model.add(
        Embedding(
            input_dim=num_tokens,
            output_dim=EMBEDDING_SIZE,
            input_length=max_tokens,
            name='layer_embedding',
        )
    )
    # Every GRU except the last returns the full sequence so that the next
    # recurrent layer receives one vector per time step.
    model.add(GRU(units=16, name="gru_1", return_sequences=True))
    model.add(GRU(units=8, name="gru_2", return_sequences=True))
    model.add(GRU(units=4, name="gru_3"))
    model.add(Dense(1, activation='sigmoid', name="dense_1"))
    optimizer = Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line.
    model.summary()
    return model


In [9]:
def train_model(model,inout_sequence,y_train):
    """Fit ``model`` on the given sequences and labels, then return it.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        inout_sequence: Training inputs passed to ``fit`` as the first
            positional argument.  (The parameter name is kept as-is for
            backward compatibility; the body previously referred to an
            undefined ``input_sequence`` and raised ``NameError``.)
        y_train: Training labels passed to ``fit`` as the second positional
            argument.

    Returns:
        The same ``model`` object after ``fit`` has been called on it.

    Note:
        Batch size and epoch count are read from the module-level constants
        ``BATCH_SIZE`` and ``EPOCHS``, which must be defined before calling.
    """
    model.fit(inout_sequence, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1)
    return model


In [None]:
'''import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    """Load and preprocess the data"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Assuming the format is: text\tlabel
            text, label = line.strip().split('\t')
            data.append({'text': text, 'label': int(label)})
    return pd.DataFrame(data)

def preprocess_data(df):
    """Preprocess the text data"""
    # Convert labels to binary (assuming 0 is negative, 1 is positive)
    df['label'] = df['label'].map({0: 0, 1: 1})
    return df

def train_model(X_train, y_train):
    """Train the sentiment analysis model"""
    # Initialize TF-IDF vectorizer
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    
    # Transform the training data
    X_train_tfidf = vectorizer.fit_transform(X_train)
    
    # Initialize and train the model
    model = LogisticRegression(random_state=42)
    model.fit(X_train_tfidf, y_train)
    
    return model, vectorizer

def evaluate_model(model, vectorizer, X_test, y_test):
    """Evaluate the model and create confusion matrix"""
    # Transform test data
    X_test_tfidf = vectorizer.transform(X_test)
    
    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig('confusion_matrix.png')
    plt.close()

def main():
    # Load training data
    print("Loading training data...")
    train_df = load_data('Dataset/training.txt')
    train_df = preprocess_data(train_df)
    
    # Split data into features and labels
    X = train_df['text']
    y = train_df['label']
    
    # Split into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    print("Training the model...")
    model, vectorizer = train_model(X_train, y_train)
    
    # Evaluate on validation set
    print("\nEvaluating on validation set:")
    evaluate_model(model, vectorizer, X_val, y_val)
    
    # Load and evaluate on test data
    print("\nLoading and evaluating on test data...")
    test_df = load_data('Dataset/testdata.txt')
    test_df = preprocess_data(test_df)
    evaluate_model(model, vectorizer, test_df['text'], test_df['label'])

if __name__ == "__main__":
    main()
    '''